diff options
71 files changed, 5273 insertions, 5029 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5f6c1ce9673b..b517257a6315 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -78,12 +78,6 @@ #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) -#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ - KVM_BUS_LOCK_DETECTION_EXIT) - -#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ - KVM_X86_NOTIFY_VMEXIT_USER) - /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -161,12 +155,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50 -#define KVM_MIN_ALLOC_MMU_PAGES 64UL -#define KVM_MMU_HASH_SHIFT 12 -#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_VAR_MTRR 8 @@ -315,6 +303,53 @@ enum x86_intercept_stage; struct kvm_kernel_irqfd; struct kvm_kernel_irq_routing_entry; +struct kvm_x86_msr_filter; +struct kvm_x86_pmu_event_filter; + +struct kvm_caps { + /* control of guest tsc rate supported? */ + bool has_tsc_control; + /* maximum supported tsc_khz for guests */ + u32 max_guest_tsc_khz; + /* number of bits of the fractional part of the TSC scaling ratio */ + u8 tsc_scaling_ratio_frac_bits; + /* maximum allowed value of TSC scaling ratio */ + u64 max_tsc_scaling_ratio; + /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ + u64 default_tsc_scaling_ratio; + /* bus lock detection supported? */ + bool has_bus_lock_exit; + /* notify VM exit supported? 
*/ + bool has_notify_vmexit; + /* bit mask of VM types */ + u32 supported_vm_types; + + u64 supported_mce_cap; + u64 supported_xcr0; + u64 supported_xss; + u64 supported_perf_cap; + + u64 supported_quirks; + u64 inapplicable_quirks; +}; +extern struct kvm_caps kvm_caps; + +struct kvm_host_values { + /* + * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical + * address bits irrespective of features that repurpose legal bits, + * e.g. MKTME. + */ + u8 maxphyaddr; + + u64 efer; + u64 xcr0; + u64 xss; + u64 s_cet; + u64 arch_capabilities; +}; +extern struct kvm_host_values kvm_host; + /* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in @@ -452,9 +487,24 @@ struct kvm_pio_request { #define PT64_ROOT_MAX_LEVEL 5 -struct rsvd_bits_validate { +struct kvm_page_format { u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; u64 bad_mt_xwr; + + /* + * The pkru_mask indicates if protection key checks are needed. It + * consists of 16 domains indexed by page fault error code bits [4:1], + * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. + * Each domain has 2 bits which are ANDed with AD and WD from PKRU. + */ + u32 pkru_mask; + + /* + * Bitmap; bit set = permission fault + * Array index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u16 permissions[16]; }; struct kvm_mmu_root_info { @@ -478,43 +528,35 @@ struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, - * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the + * and 2-level 32-bit). The kvm_pagewalk structure abstracts the details of the * current mmu mode. 
*/ -struct kvm_mmu { +struct kvm_pagewalk { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault, bool from_hardware); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); + + union kvm_cpu_role cpu_role; + struct kvm_page_format fmt; +}; + +struct kvm_mmu { + int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); int (*sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i); + struct kvm_pagewalk *w; + struct kvm_mmu_root_info root; hpa_t mirror_root_hpa; - union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; - /* - * The pkru_mask indicates if protection key checks are needed. It - * consists of 16 domains indexed by page fault error code bits [4:1], - * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. - * Each domain has 2 bits which are ANDed with AD and WD from PKRU. - */ - u32 pkru_mask; - struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; - /* - * Bitmap; bit set = permission fault - * Byte index: page fault error code [4:1] - * Bit index: pte permissions in ACC_* format - */ - u16 permissions[16]; - u64 *pae_root; u64 *pml4_root; u64 *pml5_root; @@ -524,8 +566,7 @@ struct kvm_mmu { * bits include not only hardware reserved bits but also * the bits spte never used. 
*/ - struct rsvd_bits_validate shadow_zero_check; - struct rsvd_bits_validate guest_rsvd_check; + struct kvm_page_format fmt; }; enum pmc_type { @@ -866,24 +907,14 @@ struct kvm_vcpu_arch { /* Non-nested MMU for L1 */ struct kvm_mmu root_mmu; - /* L1 MMU when running nested */ + /* L1 TDP when running nested */ struct kvm_mmu guest_mmu; + struct kvm_pagewalk ngpa_walk; /* - * Paging state of an L2 guest (used for nested npt) - * - * This context will save all necessary information to walk page tables - * of an L2 guest. This context is only initialized for page table - * walking and not for faulting since we never handle l2 page faults on - * the host. - */ - struct kvm_mmu nested_mmu; - - /* - * Pointer to the mmu context currently used for - * gva_to_gpa translations. + * Pagewalk context used for gva_to_gpa translations. */ - struct kvm_mmu *walk_mmu; + struct kvm_pagewalk gva_walk; u64 pdptrs[4]; /* pae */ @@ -1246,13 +1277,6 @@ struct kvm_hv { }; #endif -struct msr_bitmap_range { - u32 flags; - u32 nmsrs; - u32 base; - unsigned long *bitmap; -}; - #ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { @@ -1283,132 +1307,6 @@ enum kvm_suppress_eoi_broadcast_mode { KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */ }; -struct kvm_x86_msr_filter { - u8 count; - bool default_allow:1; - struct msr_bitmap_range ranges[16]; -}; - -struct kvm_x86_pmu_event_filter { - __u32 action; - __u32 nevents; - __u32 fixed_counter_bitmap; - __u32 flags; - __u32 nr_includes; - __u32 nr_excludes; - __u64 *includes; - __u64 *excludes; - __u64 events[] __counted_by(nevents); -}; - -enum kvm_apicv_inhibit { - - /********************************************************************/ - /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ - /********************************************************************/ - - /* - * APIC acceleration is disabled by a module parameter - * and/or not supported in hardware. 
- */ - APICV_INHIBIT_REASON_DISABLED, - - /* - * APIC acceleration is inhibited because AutoEOI feature is - * being used by a HyperV guest. - */ - APICV_INHIBIT_REASON_HYPERV, - - /* - * APIC acceleration is inhibited because the userspace didn't yet - * enable the kernel/split irqchip. - */ - APICV_INHIBIT_REASON_ABSENT, - - /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ - * (out of band, debug measure of blocking all interrupts on this vCPU) - * was enabled, to avoid AVIC/APICv bypassing it. - */ - APICV_INHIBIT_REASON_BLOCKIRQ, - - /* - * APICv is disabled because not all vCPUs have a 1:1 mapping between - * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. - */ - APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, - - /* - * For simplicity, the APIC acceleration is inhibited - * first time either APIC ID or APIC base are changed by the guest - * from their reset values. - */ - APICV_INHIBIT_REASON_APIC_ID_MODIFIED, - APICV_INHIBIT_REASON_APIC_BASE_MODIFIED, - - /******************************************************/ - /* INHIBITs that are relevant only to the AMD's AVIC. */ - /******************************************************/ - - /* - * AVIC is inhibited on a vCPU because it runs a nested guest. - * - * This is needed because unlike APICv, the peers of this vCPU - * cannot use the doorbell mechanism to signal interrupts via AVIC when - * a vCPU runs nested. - */ - APICV_INHIBIT_REASON_NESTED, - - /* - * On SVM, the wait for the IRQ window is implemented with pending vIRQ, - * which cannot be injected when the AVIC is enabled, thus AVIC - * is inhibited while KVM waits for IRQ window. - */ - APICV_INHIBIT_REASON_IRQWIN, - - /* - * PIT (i8254) 're-inject' mode, relies on EOI intercept, - * which AVIC doesn't support for edge triggered interrupts. - */ - APICV_INHIBIT_REASON_PIT_REINJ, - - /* - * AVIC is disabled because SEV doesn't support it. 
- */ - APICV_INHIBIT_REASON_SEV, - - /* - * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 - * mapping between logical ID and vCPU. - */ - APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, - - /* - * AVIC is disabled because the vCPU's APIC ID is beyond the max - * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. - */ - APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, - - NR_APICV_INHIBIT_REASONS, -}; - -#define __APICV_INHIBIT_REASON(reason) \ - { BIT(APICV_INHIBIT_REASON_##reason), #reason } - -#define APICV_INHIBIT_REASONS \ - __APICV_INHIBIT_REASON(DISABLED), \ - __APICV_INHIBIT_REASON(HYPERV), \ - __APICV_INHIBIT_REASON(ABSENT), \ - __APICV_INHIBIT_REASON(BLOCKIRQ), \ - __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ - __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ - __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ - __APICV_INHIBIT_REASON(NESTED), \ - __APICV_INHIBIT_REASON(IRQWIN), \ - __APICV_INHIBIT_REASON(PIT_REINJ), \ - __APICV_INHIBIT_REASON(SEV), \ - __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ - __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) - struct kvm_possible_nx_huge_pages { /* * A list of kvm_mmu_page structs that, if zapped, could possibly be @@ -1771,11 +1669,6 @@ struct kvm_lapic_irq { bool msi_redir_hint; }; -static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) -{ - return dest_mode_logical ? 
APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; -} - enum kvm_x86_run_flags { KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), KVM_RUN_LOAD_GUEST_DR6 = BIT(1), @@ -2054,7 +1947,6 @@ struct kvm_arch_async_pf { u64 error_code; }; -extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; extern bool __read_mostly enable_ipiv; @@ -2069,9 +1961,6 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> -int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); -void kvm_x86_vendor_exit(void); - #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -2114,304 +2003,12 @@ enum kvm_intr_type { ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) -void __init kvm_mmu_x86_module_init(void); -int kvm_mmu_vendor_module_init(void); -void kvm_mmu_vendor_module_exit(void); - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu); -int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); -void kvm_mmu_uninit_vm(struct kvm *kvm); - -void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, - struct kvm_memory_slot *slot); - -void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); -void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - const struct kvm_memory_slot *memslot, - int start_level); -void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, - const struct kvm_memory_slot *memslot, - int target_level); -void kvm_mmu_try_split_huge_pages(struct kvm *kvm, - const struct kvm_memory_slot *memslot, - u64 start, u64 end, - int target_level); -void kvm_mmu_recover_huge_pages(struct kvm *kvm, - const struct kvm_memory_slot *memslot); -void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - const struct kvm_memory_slot *memslot); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); 
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); - -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); - -extern bool tdp_enabled; - -/* - * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing - * userspace I/O) to indicate that the emulation context - * should be reused as is, i.e. skip initialization of - * emulation context, instruction fetch and decode. - * - * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. - * Indicates that only select instructions (tagged with - * EmulateOnUD) should be emulated (to minimize the emulator - * attack surface). See also EMULTYPE_TRAP_UD_FORCED. - * - * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to - * decode the instruction length. For use *only* by - * kvm_x86_ops.skip_emulated_instruction() implementations if - * EMULTYPE_COMPLETE_USER_EXIT is not set. - * - * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to - * retry native execution under certain conditions, - * Can only be set in conjunction with EMULTYPE_PF. - * - * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was - * triggered by KVM's magic "force emulation" prefix, - * which is opt in via module param (off by default). - * Bypasses EmulateOnUD restriction despite emulating - * due to an intercepted #UD (see EMULTYPE_TRAP_UD). - * Used to test the full emulator from userspace. - * - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware - * backdoor emulation, which is opt in via module param. - * VMware backdoor emulation handles select instructions - * and reinjects the #GP for all other cases. - * - * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case - * the CR2/GPA value pass on the stack is valid. 
- * - * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility - * state and inject single-step #DBs after skipping - * an instruction (after completing userspace I/O). - * - * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that - * is attempting to write a gfn that contains one or - * more of the PTEs used to translate the write itself, - * and the owning page table is being shadowed by KVM. - * If emulation of the faulting instruction fails and - * this flag is set, KVM will exit to userspace instead - * of retrying emulation as KVM cannot make forward - * progress. - * - * If emulation fails for a write to guest page tables, - * KVM unprotects (zaps) the shadow page for the target - * gfn and resumes the guest to retry the non-emulatable - * instruction (on hardware). Unprotecting the gfn - * doesn't allow forward progress for a self-changing - * access because doing so also zaps the translation for - * the gfn, i.e. retrying the instruction will hit a - * !PRESENT fault, which results in a new shadow page - * and sends KVM back to square one. - * - * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip - * an instruction if it could generate a given software - * interrupt, which must be encoded via - * EMULTYPE_SET_SOFT_INT_VECTOR(). 
- */ -#define EMULTYPE_NO_DECODE (1 << 0) -#define EMULTYPE_TRAP_UD (1 << 1) -#define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_ALLOW_RETRY_PF (1 << 3) -#define EMULTYPE_TRAP_UD_FORCED (1 << 4) -#define EMULTYPE_VMWARE_GP (1 << 5) -#define EMULTYPE_PF (1 << 6) -#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) -#define EMULTYPE_WRITE_PF_TO_SP (1 << 8) -#define EMULTYPE_SKIP_SOFT_INT (1 << 9) - -#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16) -#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff) - -static inline bool kvm_can_emulate_event_vectoring(int emul_type) -{ - return !(emul_type & EMULTYPE_PF); -} - -int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); -int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, - void *insn, int insn_len); -void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, - u64 *data, u8 ndata); -void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); - -void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); -void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); - -void kvm_enable_efer_bits(u64); -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); -int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); -int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); -int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); -int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); -int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); -int kvm_emulate_invd(struct kvm_vcpu *vcpu); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu); 
-int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu); - -int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); -int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); -int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu); -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); - -void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, - int reason, bool has_error_code, u32 error_code); - -void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); -void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); -int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); - -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); - -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned 
nr, u32 error_code); -void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, - bool has_error_code, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, - bool from_hardware); -void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault, - bool from_hardware); - -static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - __kvm_inject_emulated_page_fault(vcpu, fault, false); -} - -bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); - -static inline int __kvm_irq_line_state(unsigned long *irq_state, - int irq_source_id, int level) -{ - /* Logical OR for level trig interrupt */ - if (level) - __set_bit(irq_source_id, irq_state); - else - __clear_bit(irq_source_id, irq_state); - - return !!(*irq_state); -} - -void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); - -void kvm_update_dr7(struct kvm_vcpu *vcpu); - -bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - bool always_retry); - -static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, - gpa_t cr2_or_gpa) -{ - return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); -} - -void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, - ulong roots_to_free); -void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); - -bool kvm_apicv_activated(struct kvm *kvm); -bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); -void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); -void 
__kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, - enum kvm_apicv_inhibit reason, bool set); -void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, - enum kvm_apicv_inhibit reason, bool set); - -static inline void kvm_set_apicv_inhibit(struct kvm *kvm, - enum kvm_apicv_inhibit reason) -{ - kvm_set_or_clear_apicv_inhibit(kvm, reason, true); -} - -static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, - enum kvm_apicv_inhibit reason) -{ - kvm_set_or_clear_apicv_inhibit(kvm, reason, false); -} - -void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc); - -static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm) -{ - kvm_inc_or_dec_irq_window_inhibit(kvm, true); -} - -static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm) -{ - kvm_inc_or_dec_irq_window_inhibit(kvm, false); -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, - void *insn, int insn_len); -void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - u64 addr, unsigned long roots); -void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); - -void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, - int tdp_max_root_level, int tdp_huge_page_level); - - #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) -static inline u16 kvm_read_ldt(void) -{ - u16 ldt; - asm("sldt %0" : "=g"(ldt)); - return ldt; -} - -static inline void kvm_load_ldt(u16 sel) -{ - asm("lldt %0" : : "rm"(sel)); -} - #ifdef CONFIG_X86_64 static inline unsigned long read_msr(unsigned long msr) { @@ -2422,18 +2019,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline 
void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) -{ - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -} - -#define TSS_IOPB_BASE_OFFSET 0x66 -#define TSS_BASE_SIZE 0x68 -#define TSS_IOPB_SIZE (65536 / 8) -#define TSS_REDIRECTION_SIZE (256 / 8) -#define RMODE_TSS_SIZE \ - (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) - enum { TASK_SWITCH_CALL = 0, TASK_SWITCH_IRET = 1, @@ -2456,40 +2041,6 @@ enum { # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif -int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_cpu_has_extint(struct kvm_vcpu *v); -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); -int kvm_cpu_get_extint(struct kvm_vcpu *v); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); - -int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, u32 min, - unsigned long icr, int op_64_bit); - -int kvm_add_user_return_msr(u32 msr); -int kvm_find_user_return_msr(u32 msr); -int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); -u64 kvm_get_user_return_msr(unsigned int slot); - -static inline bool kvm_is_supported_user_return_msr(u32 msr) -{ - return kvm_find_user_return_msr(msr) >= 0; -} - -u64 kvm_scale_tsc(u64 tsc, u64 ratio); -u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); -u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); -u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); - -unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); - -void kvm_make_scan_ioapic_request(struct kvm *kvm); -void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, - unsigned long *vcpu_bitmap); - bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu 
*vcpu, @@ -2498,22 +2049,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); -extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); - -int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); -int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); - -void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, - u32 size); -bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); -bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); - -static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) -{ - /* We can only post Fixed and LowPrio IRQs */ - return (irq->delivery_mode == APIC_DM_FIXED || - irq->delivery_mode == APIC_DM_LOWEST); -} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { @@ -2525,36 +2060,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) kvm_x86_call(vcpu_unblocking)(vcpu); } -int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); - -#define KVM_CLOCK_VALID_FLAGS \ - (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) - -#define KVM_X86_VALID_QUIRKS \ - (KVM_X86_QUIRK_LINT0_REENABLED | \ - KVM_X86_QUIRK_CD_NW_CLEARED | \ - KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ - KVM_X86_QUIRK_OUT_7E_INC_RIP | \ - KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ - KVM_X86_QUIRK_SLOT_ZAP_ALL | \ - KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ - KVM_X86_QUIRK_IGNORE_GUEST_PAT | \ - KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \ - KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT) - -#define KVM_X86_CONDITIONAL_QUIRKS \ - (KVM_X86_QUIRK_CD_NW_CLEARED | \ - KVM_X86_QUIRK_IGNORE_GUEST_PAT) - -/* - * KVM previously used a u32 field in kvm_run to indicate the hypercall was - * initiated from long mode. 
KVM now sets bit 0 to indicate long mode, but the - * remaining 31 lower bits must be 0 to preserve ABI. - */ -#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) - static inline bool kvm_arch_has_irq_bypass(void) { return enable_device_posted_irqs; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 77337c37324b..0474604ab8a1 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,8 +5,8 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror include $(srctree)/virt/kvm/Makefile.kvm -kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \ - debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o +kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o msrs.o pmu.o regs.o \ + mtrr.o debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 591d2294acd7..2698fa42cd97 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,6 +28,7 @@ #include "trace.h" #include "pmu.h" #include "xen.h" +#include "x86.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h index f898781b6a06..6b7b628f530d 100644 --- a/arch/x86/kvm/fpu.h +++ b/arch/x86/kvm/fpu.h @@ -3,8 +3,34 @@ #ifndef __KVM_FPU_H_ #define __KVM_FPU_H_ +#include <linux/kvm_host.h> + +#include <trace/events/kvm.h> + #include <asm/fpu/api.h> +/* Swap (qemu) user FPU context for the guest FPU context. */ +static inline void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. 
*/ +static inline void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + typedef u32 __attribute__((vector_size(16))) sse128_t; #define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } #define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fd4eb1e561f7..1ee0d23f8949 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2045,10 +2045,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). */ - if (!hc->fast && mmu_is_nested(vcpu)) { - hc->ingpa = kvm_x86_ops.nested_ops->translate_nested_gpa( - vcpu, hc->ingpa, - PFERR_GUEST_FINAL_MASK, NULL, 0); + if (!hc->fast) { + hc->ingpa = kvm_translate_gpa(vcpu, &vcpu->arch.gva_walk, hc->ingpa, + PFERR_GUEST_FINAL_MASK, NULL, 0); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 65e89ed65349..1c8f7aaab063 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -22,7 +22,8 @@ #define __ARCH_X86_KVM_HYPERV_H__ #include <linux/kvm_host.h> -#include "x86.h" + +#include "regs.h" #ifdef CONFIG_KVM_HYPERV diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 757667fb2bfa..0d59b9c758c2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -33,6 +33,7 @@ #include "lapic.h" #include "irq.h" #include "trace.h" +#include "x86.h" static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3dadae093690..81b576513116 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -113,6 +113,18 @@ void kvm_get_ioapic(struct 
kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); + +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} #endif /* CONFIG_KVM_IOAPIC */ static inline int ioapic_in_kernel(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 8c62c6d4d5c1..727245a6ab34 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -423,6 +423,13 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +static bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) +{ + /* We can only post Fixed and LowPrio IRQs */ + return (irq->delivery_mode == APIC_DM_FIXED || + irq->delivery_mode == APIC_DM_LOWEST); +} + static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *entry) { diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 34f4a78a7a01..1a84ea31e7fd 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -112,6 +112,12 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return mode != KVM_IRQCHIP_NONE; } +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_cpu_has_extint(struct kvm_vcpu *v); +int kvm_cpu_get_extint(struct kvm_vcpu *v); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); + void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 71970213dc1f..58dbb94f980d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -131,6 +131,9 @@ static inline int 
kvm_irq_delivery_to_apic(struct kvm *kvm, } void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, u32 min, + unsigned long icr, int op_64_bit); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); @@ -237,6 +240,11 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } +static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) +{ + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e1bb663ebbd5..c9f628b97dae 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,10 +4,23 @@ #include <linux/kvm_host.h> #include "regs.h" -#include "x86.h" #include "cpuid.h" +extern bool tdp_enabled; +#ifdef CONFIG_X86_64 +extern bool tdp_mmu_enabled; +#else +#define tdp_mmu_enabled false +#endif extern bool __read_mostly enable_mmio_caching; +extern bool eager_page_split; + +#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50 +#define KVM_MIN_ALLOC_MMU_PAGES 64UL +#define KVM_MMU_HASH_SHIFT 12 +#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 @@ -90,6 +103,38 @@ static inline bool mmu_has_mbec(struct kvm_mmu *mmu) u8 kvm_mmu_get_max_tdp_level(void); +void __init kvm_mmu_x86_module_init(void); +int kvm_mmu_vendor_module_init(void); +void kvm_mmu_vendor_module_exit(void); + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu); +int kvm_mmu_create(struct kvm_vcpu *vcpu); +int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_uninit_vm(struct kvm 
*kvm); + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int start_level); +void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int target_level); +void kvm_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + u64 start, u64 end, + int target_level); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + const struct kvm_memory_slot *memslot); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); + void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); @@ -101,11 +146,24 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, bool mbec, gpa_t new_eptp); + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, + void *insn, int insn_len); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, + u64 addr, unsigned long roots); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); + +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + 
int tdp_max_root_level, int tdp_huge_page_level); + bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu); + struct kvm_pagewalk *pw); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); @@ -115,6 +173,25 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + +void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, + ulong roots_to_free); +void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception); + static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) @@ -169,21 +246,21 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) } static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu) + struct kvm_pagewalk *w) { /* * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e. - * @mmu's snapshot of CR0.WP and thus all related paging metadata may + * @w's snapshot of CR0.WP and thus all related paging metadata may * be stale. Refresh CR0.WP and the metadata on-demand when checking * for permission faults. Exempt nested MMUs, i.e. 
MMUs for shadowing - * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does - * need to refresh nested_mmu, a.k.a. the walker used to translate L2 - * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP. + * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM will + * still refresh gva_walk, so as to honor L2's CR0.WP when translating + * L2 GVAs to GPAs. */ - if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu) + if (!tdp_enabled || w == &vcpu->arch.ngpa_walk) return; - __kvm_mmu_refresh_passthrough_bits(vcpu, mmu); + __kvm_mmu_refresh_passthrough_bits(vcpu, w); } /* @@ -194,7 +271,7 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, * Return zero if the access does not fault; return the page fault error code * if the access faults. */ -static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, unsigned pte_access, unsigned pte_pkey, u64 access) { @@ -217,15 +294,16 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; + struct kvm_page_format *fmt = &w->fmt; u32 errcode = PFERR_PRESENT_MASK; bool fault; - kvm_mmu_refresh_passthrough_bits(vcpu, mmu); + kvm_mmu_refresh_passthrough_bits(vcpu, w); - fault = (mmu->permissions[index] >> pte_access) & 1; + fault = (fmt->permissions[index] >> pte_access) & 1; WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK)); - if (unlikely(mmu->pkru_mask)) { + if (unlikely(fmt->pkru_mask)) { u32 pkru_bits, offset; /* @@ -239,7 +317,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? 
PFERR_RSVD_MASK : 0); - pkru_bits &= mmu->pkru_mask >> offset; + pkru_bits &= fmt->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; fault |= (pkru_bits != 0); } @@ -261,12 +339,6 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm) return smp_load_acquire(&kvm->arch.shadow_root_allocated); } -#ifdef CONFIG_X86_64 -extern bool tdp_mmu_enabled; -#else -#define tdp_mmu_enabled false -#endif - int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) @@ -300,13 +372,18 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) atomic64_add(count, &kvm->stat.pages[level - 1]); } +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, + struct kvm_pagewalk *w, gpa_t gpa, u64 access, struct x86_exception *exception, u64 pte_access) { - if (mmu != &vcpu->arch.nested_mmu) + if (!mmu_is_nested(vcpu) || w == &vcpu->arch.ngpa_walk) return gpa; return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access, exception, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 234d0a95abf5..dda1fd266d33 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -115,6 +115,9 @@ module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled); #endif +bool __read_mostly eager_page_split = true; +module_param(eager_page_split, bool, 0644); + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; @@ -225,9 +228,9 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. 
*/ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_pagewalk *w) \ { \ - return !!(mmu->cpu_role. base_or_ext . reg##_##name); \ + return !!(w->cpu_role. base_or_ext . reg##_##name); \ } BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); @@ -238,19 +241,19 @@ BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); -static inline bool has_pferr_fetch(struct kvm_mmu *mmu) +static inline bool has_pferr_fetch(struct kvm_pagewalk *w) { - return mmu->cpu_role.ext.has_pferr_fetch; + return w->cpu_role.ext.has_pferr_fetch; } -static inline bool is_cr0_pg(struct kvm_mmu *mmu) +static inline bool is_cr0_pg(struct kvm_pagewalk *w) { - return mmu->cpu_role.base.level > 0; + return w->cpu_role.base.level > 0; } -static inline bool is_cr4_pae(struct kvm_mmu *mmu) +static inline bool is_cr4_pae(struct kvm_pagewalk *w) { - return !mmu->cpu_role.base.has_4_byte_gpte; + return !w->cpu_role.base.has_4_byte_gpte; } static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) @@ -270,12 +273,12 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) } static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu) + struct kvm_pagewalk *w) { - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && w->get_guest_pgd == get_guest_cr3) return kvm_read_cr3(vcpu); - return mmu->get_guest_pgd(vcpu); + return w->get_guest_pgd(vcpu); } static inline bool kvm_available_flush_remote_tlbs_range(void) @@ -2476,12 +2479,14 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato struct kvm_vcpu *vcpu, hpa_t root, u64 addr) { + struct kvm_pagewalk *w = vcpu->arch.mmu->w; + iterator->addr = addr; 
iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->root_role.level; if (iterator->level >= PT64_ROOT_4LEVEL && - vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && + w->cpu_role.base.level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->root_role.direct) iterator->level = PT32E_ROOT_LEVEL; @@ -3690,6 +3695,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) */ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + struct kvm_mmu *mmu; struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte; @@ -3699,6 +3705,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; + mmu = vcpu->arch.mmu; walk_shadow_page_lockless_begin(vcpu); do { @@ -3734,7 +3741,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_access_allowed(fault, spte)) { + if (!spte_permission_fault(mmu, spte, fault)) { ret = RET_PF_SPURIOUS; break; } @@ -3757,7 +3764,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * that were write-protected for dirty-logging or access * tracking are handled here. Don't bother checking if the * SPTE is writable to prioritize running with A/D bits enabled. - * The is_access_allowed() check above handles the common case + * The spte_permission_fault() check above handles the common case * of the fault being spurious, and the SPTE is known to be * shadow-present, i.e. except for access tracking restoration * making the new SPTE writable, the check is wasteful. 
@@ -3782,7 +3789,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(fault, new_spte)) + spte_permission_fault(mmu, new_spte, fault)) break; /* @@ -4088,12 +4095,13 @@ out_unlock: static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_pagewalk *w = mmu->w; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; int quadrant, i, r; hpa_t root; - root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); + root_pgd = kvm_mmu_get_guest_pgd(vcpu, w); root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { @@ -4105,9 +4113,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. */ - if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { + if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - pdptrs[i] = mmu->get_pdptr(vcpu, i); + pdptrs[i] = w->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; @@ -4129,7 +4137,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. 
*/ - if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { + if (w->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->root_role.level); mmu->root.hpa = root; @@ -4168,7 +4176,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { + if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = INVALID_PAE_ROOT; continue; @@ -4182,7 +4190,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * directory. Othwerise each PAE page direct shadows one guest * PAE page directory so that quadrant should be 0. */ - quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + quadrant = (w->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; @@ -4206,6 +4214,7 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_pagewalk *w = mmu->w; bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; @@ -4218,7 +4227,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 
*/ if (mmu->root_role.direct || - mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || + w->cpu_role.base.level >= PT64_ROOT_4LEVEL || mmu->root_role.level < PT64_ROOT_4LEVEL) return 0; @@ -4323,7 +4332,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->w->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; if (!is_unsync_root(root)) @@ -4364,7 +4373,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, gpa_t vaddr, u64 access, struct x86_exception *exception) { @@ -4376,7 +4385,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * user-mode address if CR0.PG=0. Therefore *include* ACC_USER_MASK in * the last argument to kvm_translate_gpa (which NPT does not use). 
*/ - return kvm_translate_gpa(vcpu, mmu, vaddr, access | PFERR_GUEST_FINAL_MASK, + return kvm_translate_gpa(vcpu, w, vaddr, access | PFERR_GUEST_FINAL_MASK, exception, ACC_ALL); } @@ -4440,7 +4449,7 @@ static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; - struct rsvd_bits_validate *rsvd_check; + struct kvm_page_format *rsvd_check; int root, leaf, level; bool reserved = false; @@ -4461,7 +4470,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(sptes[leaf])) leaf++; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + rsvd_check = &vcpu->arch.mmu->fmt; for (level = root; level >= leaf; level--) reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); @@ -4565,43 +4574,12 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, if (arch.direct_map) arch.cr3 = (unsigned long)INVALID_GPA; else - arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) - return; - - if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu->root_role.direct && - work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) - return; - - r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, - true, NULL, NULL); - - /* - * Account fixed page faults, otherwise they'll never be counted, but - * ignore stats for all other return times. 
Page-ready "faults" aren't - * truly spurious and never trigger emulation - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; -} - static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { @@ -4958,7 +4936,7 @@ out_unlock: } #endif -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) @@ -4968,6 +4946,71 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } +static int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 err, bool prefetch, int *emulation_type, + u8 *level) +{ + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefetch = prefetch, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + .is_private = err & PFERR_PRIVATE_ACCESS, + + .pfn = KVM_PFN_ERR_FAULT, + }; + int r; + + if (vcpu->arch.mmu->root_role.direct) { + /* + * Things like memslots don't understand the concept of a shared + * bit. Strip it so that the GFN can be used like normal, and the + * fault.addr can be used when the shared bit is needed. + */ + fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm); + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + + /* + * With retpoline being active an indirect call is rather expensive, + * so do a direct call in the most common case. 
+ */ + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) + r = kvm_tdp_page_fault(vcpu, &fault); + else + r = vcpu->arch.mmu->page_fault(vcpu, &fault); + + /* + * Not sure what's happening, but punt to userspace and hope that + * they can fix it by changing memory to shared, or they can + * provide a better error. + */ + if (r == RET_PF_EMULATE && fault.is_private) { + pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); + kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); + return -EFAULT; + } + + if (fault.write_fault_to_shadow_pgtable && emulation_type) + *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; + + return r; +} + + static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { @@ -5058,6 +5101,37 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, return min(range->size, end - range->gpa); } +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) +{ + int r; + + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || + work->wakeup_all) + return; + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return; + + if (!vcpu->arch.mmu->root_role.direct && + work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w)) + return; + + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. 
Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; +} + #ifdef CONFIG_KVM_GUEST_MEMFD static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot) { @@ -5141,7 +5215,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn); static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_spte = NULL; } @@ -5317,7 +5390,7 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, #include "paging_tmpl.h" #undef PTTYPE -static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, +static void __reset_rsvds_bits_mask(struct kvm_page_format *fmt, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { @@ -5325,7 +5398,7 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 nonleaf_bit8_rsvd = 0; u64 high_bits_rsvd; - rsvd_check->bad_mt_xwr = 0; + fmt->bad_mt_xwr = 0; if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); @@ -5349,75 +5422,75 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - rsvd_check->rsvd_bits_mask[0][1] = 0; - rsvd_check->rsvd_bits_mask[0][0] = 0; - rsvd_check->rsvd_bits_mask[1][0] = - rsvd_check->rsvd_bits_mask[0][0]; + fmt->rsvd_bits_mask[0][1] = 0; + fmt->rsvd_bits_mask[0][0] = 0; + fmt->rsvd_bits_mask[1][0] = + fmt->rsvd_bits_mask[0][0]; if (!pse) { - rsvd_check->rsvd_bits_mask[1][1] = 0; + fmt->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + fmt->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + fmt->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - 
rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | + fmt->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | high_bits_rsvd | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ - rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + fmt->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ + fmt->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ + fmt->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ - rsvd_check->rsvd_bits_mask[1][0] = - rsvd_check->rsvd_bits_mask[0][0]; + fmt->rsvd_bits_mask[1][0] = + fmt->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: - rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | + fmt->rsvd_bits_mask[0][4] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); - rsvd_check->rsvd_bits_mask[1][4] = - rsvd_check->rsvd_bits_mask[0][4]; + fmt->rsvd_bits_mask[1][4] = + fmt->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: - rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | + fmt->rsvd_bits_mask[0][3] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); - rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | + fmt->rsvd_bits_mask[0][2] = high_bits_rsvd | gbpages_bit_rsvd; - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; - rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; - rsvd_check->rsvd_bits_mask[1][3] = - rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | + fmt->rsvd_bits_mask[0][1] = high_bits_rsvd; + fmt->rsvd_bits_mask[0][0] = high_bits_rsvd; + fmt->rsvd_bits_mask[1][3] = + fmt->rsvd_bits_mask[0][3]; + fmt->rsvd_bits_mask[1][2] = high_bits_rsvd | gbpages_bit_rsvd | rsvd_bits(13, 29); - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | + fmt->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ - rsvd_check->rsvd_bits_mask[1][0] = - rsvd_check->rsvd_bits_mask[0][0]; + fmt->rsvd_bits_mask[1][0] = + 
fmt->rsvd_bits_mask[0][0]; break; } } static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) + struct kvm_pagewalk *w) { - __reset_rsvds_bits_mask(&context->guest_rsvd_check, + __reset_rsvds_bits_mask(&w->fmt, vcpu->arch.reserved_gpa_bits, - context->cpu_role.base.level, is_efer_nx(context), + w->cpu_role.base.level, is_efer_nx(w), guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), - is_cr4_pse(context), + is_cr4_pse(w), guest_cpuid_is_amd_compatible(vcpu)); } -static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, +static void __reset_rsvds_bits_mask_ept(struct kvm_page_format *fmt, u64 pa_bits_rsvd, bool execonly, int huge_page_level) { @@ -5430,18 +5503,18 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, if (huge_page_level < PG_LEVEL_2M) large_2m_rsvd = rsvd_bits(7, 7); - rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; - rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; + fmt->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); + fmt->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); + fmt->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; + fmt->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; + fmt->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ - rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; - rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; - rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; + 
fmt->rsvd_bits_mask[1][4] = fmt->rsvd_bits_mask[0][4]; + fmt->rsvd_bits_mask[1][3] = fmt->rsvd_bits_mask[0][3]; + fmt->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; + fmt->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; + fmt->rsvd_bits_mask[1][0] = fmt->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ @@ -5452,13 +5525,13 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, /* bits 0..2 must not be 100 unless VMX capabilities allow it */ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } - rsvd_check->bad_mt_xwr = bad_mt_xwr; + fmt->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly, int huge_page_level) + bool execonly, int huge_page_level) { - __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + __reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.fmt, vcpu->arch.reserved_gpa_bits, execonly, huge_page_level); } @@ -5480,13 +5553,13 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ bool is_pse = false; - struct rsvd_bits_validate *shadow_zero_check; + struct kvm_page_format *fmt; int i; WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); - shadow_zero_check = &context->shadow_zero_check; - __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), + fmt = &context->fmt; + __reset_rsvds_bits_mask(fmt, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), @@ -5502,10 +5575,10 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * Bits in shadow_me_mask but not in shadow_me_value are * not allowed to be set. 
*/ - shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; - shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; - shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; - shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; + fmt->rsvd_bits_mask[0][i] |= shadow_me_mask; + fmt->rsvd_bits_mask[1][i] |= shadow_me_mask; + fmt->rsvd_bits_mask[0][i] &= ~shadow_me_value; + fmt->rsvd_bits_mask[1][i] &= ~shadow_me_value; } } @@ -5522,18 +5595,18 @@ static inline bool boot_cpu_is_amd(void) */ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { - struct rsvd_bits_validate *shadow_zero_check; + struct kvm_page_format *fmt; int i; - shadow_zero_check = &context->shadow_zero_check; + fmt = &context->fmt; if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), + __reset_rsvds_bits_mask(fmt, reserved_hpa_bits(), context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else - __reset_rsvds_bits_mask_ept(shadow_zero_check, + __reset_rsvds_bits_mask_ept(fmt, reserved_hpa_bits(), false, max_huge_page_level); @@ -5541,8 +5614,8 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) return; for (i = context->root_role.level; --i >= 0;) { - shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; - shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + fmt->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + fmt->rsvd_bits_mask[1][i] &= ~shadow_me_mask; } } @@ -5553,7 +5626,7 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) static void reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + __reset_rsvds_bits_mask_ept(&context->fmt, reserved_hpa_bits(), execonly, max_huge_page_level); } @@ -5588,18 +5661,15 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) (14 & (access) ? 1 << 14 : 0) | \ (15 & (access) ? 
1 << 15 : 0)) -static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) +static void __update_permission_bitmask(struct kvm_page_format *fmt, bool tdp, + bool ept, bool cr4_smep, bool cr4_smap, + bool cr0_wp, bool efer_nx) { unsigned index; const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK); const u16 r = ACC_BITS_MASK(ACC_READ_MASK); - bool cr4_smep = is_cr4_smep(mmu); - bool cr4_smap = is_cr4_smap(mmu); - bool cr0_wp = is_cr0_wp(mmu); - bool efer_nx = is_efer_nx(mmu); - /* * In hardware, page fault error codes are generated (as the name * suggests) on any kind of page fault. permission_fault() and @@ -5612,7 +5682,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) * permission_fault() to indicate accesses that are *not* subject to * SMAP restrictions. */ - for (index = 0; index < ARRAY_SIZE(mmu->permissions); ++index) { + for (index = 0; index < ARRAY_SIZE(fmt->permissions); ++index) { unsigned pfec = index << 1; /* @@ -5686,10 +5756,23 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } - mmu->permissions[index] = ff | uf | wf | rf | smapf; + fmt->permissions[index] = ff | uf | wf | rf | smapf; } } +static void update_permission_bitmask(struct kvm_pagewalk *w, bool tdp, bool ept) +{ + __update_permission_bitmask(&w->fmt, tdp, ept, + is_cr4_smep(w), is_cr4_smap(w), + is_cr0_wp(w), is_efer_nx(w)); +} + +static void update_spte_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) +{ + __update_permission_bitmask(&mmu->fmt, tdp, ept, + mmu->root_role.cr4_smep, false, true, true); +} + /* * PKU is an additional mechanism by which the paging controls access to * user-mode addresses based on the value in the PKRU register. Protection @@ -5714,19 +5797,19 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept) * away both AD and WD. 
For all reads or if the last condition holds, WD * only will be masked away. */ -static void update_pkru_bitmask(struct kvm_mmu *mmu) +static void update_pkru_bitmask(struct kvm_pagewalk *w) { unsigned bit; bool wp; - mmu->pkru_mask = 0; + w->fmt.pkru_mask = 0; - if (!is_cr4_pke(mmu)) + if (!is_cr4_pke(w)) return; - wp = is_cr0_wp(mmu); + wp = is_cr0_wp(w); - for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + for (bit = 0; bit < ARRAY_SIZE(w->fmt.permissions); ++bit) { unsigned pfec, pkey_bits; bool check_pkey, check_write, ff, uf, wf, pte_user; @@ -5754,32 +5837,30 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) /* PKRU.WD stops write access. */ pkey_bits |= (!!check_write) << 1; - mmu->pkru_mask |= (pkey_bits & 3) << pfec; + w->fmt.pkru_mask |= (pkey_bits & 3) << pfec; } } static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu) + struct kvm_pagewalk *w) { - if (!is_cr0_pg(mmu)) + if (!is_cr0_pg(w)) return; - reset_guest_rsvds_bits_mask(vcpu, mmu); - update_permission_bitmask(mmu, mmu == &vcpu->arch.guest_mmu, false); - update_pkru_bitmask(mmu); + reset_guest_rsvds_bits_mask(vcpu, w); + update_permission_bitmask(w, w == &vcpu->arch.ngpa_walk, false); + update_pkru_bitmask(w); } static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; context->sync_spte = paging32_sync_spte; } @@ -5825,18 +5906,18 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, } void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu) + struct kvm_pagewalk *w) { const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); 
BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); - if (is_cr0_wp(mmu) == cr0_wp) + if (is_cr0_wp(w) == cr0_wp) return; - mmu->cpu_role.base.cr0_wp = cr0_wp; - reset_guest_paging_metadata(vcpu, mmu); + w->cpu_role.base.cr0_wp = cr0_wp; + reset_guest_paging_metadata(vcpu, w); } static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) @@ -5894,52 +5975,37 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); - if (cpu_role.as_u64 == context->cpu_role.as_u64 && - root_role.word == context->root_role.word) + if (root_role.word == context->root_role.word) return; - context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; context->sync_spte = NULL; - context->get_guest_pgd = get_guest_cr3; - context->get_pdptr = kvm_pdptr_read; - context->inject_page_fault = kvm_inject_page_fault; - - if (!is_cr0_pg(context)) - context->gva_to_gpa = nonpaging_gva_to_gpa; - else if (is_cr4_pae(context)) - context->gva_to_gpa = paging64_gva_to_gpa; - else - context->gva_to_gpa = paging32_gva_to_gpa; - reset_guest_paging_metadata(vcpu, context); + update_spte_permission_bitmask(context, true, shadow_xs_mask); reset_tdp_shadow_zero_bits_mask(context); } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - union kvm_cpu_role cpu_role, union kvm_mmu_page_role root_role) { - if (cpu_role.as_u64 == context->cpu_role.as_u64 && - root_role.word == context->root_role.word) + if (root_role.word == context->root_role.word) return; - context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; - if (!is_cr0_pg(context)) + if (!is_cr0_pg(context->w)) nonpaging_init_context(context); - else if (is_cr4_pae(context)) + else if (is_cr4_pae(context->w)) paging64_init_context(context); else paging32_init_context(context); - 
reset_guest_paging_metadata(vcpu, context); + update_spte_permission_bitmask(context, context == &vcpu->arch.guest_mmu, false); reset_shadow_zero_bits_mask(vcpu, context); } -static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, +static void init_kvm_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; @@ -5961,7 +6027,28 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, */ root_role.efer_nx = true; - shadow_mmu_init_context(vcpu, context, cpu_role, root_role); + shadow_mmu_init_context(vcpu, context, root_role); +} + +static void init_kvm_page_walk(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, + union kvm_cpu_role cpu_role) +{ + if (cpu_role.as_u64 == w->cpu_role.as_u64) + return; + + w->cpu_role.as_u64 = cpu_role.as_u64; + w->inject_page_fault = kvm_inject_page_fault; + w->get_pdptr = kvm_pdptr_read; + w->get_guest_pgd = get_guest_cr3; + + if (!is_cr0_pg(w)) + w->gva_to_gpa = nonpaging_gva_to_gpa; + else if (is_cr4_pae(w)) + w->gva_to_gpa = paging64_gva_to_gpa; + else + w->gva_to_gpa = paging32_gva_to_gpa; + + reset_guest_paging_metadata(vcpu, w); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4, @@ -5980,13 +6067,15 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4, WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0; + init_kvm_page_walk(vcpu, &vcpu->arch.ngpa_walk, cpu_role); + root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); if (root_role.level == PT64_ROOT_5LEVEL && cpu_role.base.level == PT64_ROOT_4LEVEL) root_role.passthrough = 1; - shadow_mmu_init_context(vcpu, context, cpu_role, root_role); + shadow_mmu_init_context(vcpu, context, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu); @@ -6027,18 +6116,22 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level, mbec); - if (new_mode.as_u64 != context->cpu_role.as_u64) { + struct kvm_pagewalk *ngpa_walk = &vcpu->arch.ngpa_walk; + + if (new_mode.as_u64 != ngpa_walk->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ - context->cpu_role.as_u64 = new_mode.as_u64; + ngpa_walk->cpu_role.as_u64 = new_mode.as_u64; context->root_role.word = new_mode.base.word; context->page_fault = ept_page_fault; - context->gva_to_gpa = ept_gva_to_gpa; + ngpa_walk->gva_to_gpa = ept_gva_to_gpa; context->sync_spte = ept_sync_spte; - update_permission_bitmask(context, true, true); - context->pkru_mask = 0; - reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); + update_permission_bitmask(ngpa_walk, true, true); + ngpa_walk->fmt.pkru_mask = 0; + reset_rsvds_bits_mask_ept(vcpu, execonly, huge_page_level); + + update_spte_permission_bitmask(context, true, true); reset_ept_shadow_zero_bits_mask(context, execonly); } @@ -6046,68 +6139,19 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu); -static void init_kvm_softmmu(struct kvm_vcpu *vcpu, - union kvm_cpu_role cpu_role) -{ - struct kvm_mmu *context = &vcpu->arch.root_mmu; - - kvm_init_shadow_mmu(vcpu, cpu_role); - - context->get_guest_pgd = get_guest_cr3; - context->get_pdptr = kvm_pdptr_read; - context->inject_page_fault = kvm_inject_page_fault; -} - -static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, - union kvm_cpu_role new_mode) -{ - struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; - - if (new_mode.as_u64 == g_context->cpu_role.as_u64) - return; - - g_context->cpu_role.as_u64 = new_mode.as_u64; - g_context->get_guest_pgd = get_guest_cr3; - g_context->get_pdptr = kvm_pdptr_read; - g_context->inject_page_fault = kvm_inject_page_fault; - - /* - * L2 page tables are never shadowed, so there is no need to sync - * SPTEs. 
- */ - g_context->sync_spte = NULL; - - /* - * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using - * L1's nested page tables (e.g. EPT12). The nested translation - * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using - * L2's page tables as the first level of translation and L1's - * nested page tables as the second level of translation. Basically - * the gva_to_gpa functions between mmu and nested_mmu are swapped. - */ - if (!is_paging(vcpu)) - g_context->gva_to_gpa = nonpaging_gva_to_gpa; - else if (is_long_mode(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa; - else if (is_pae(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa; - else - g_context->gva_to_gpa = paging32_gva_to_gpa; - - reset_guest_paging_metadata(vcpu, g_context); -} - void kvm_init_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs); - if (mmu_is_nested(vcpu)) - init_kvm_nested_mmu(vcpu, cpu_role); - else if (tdp_enabled) - init_kvm_tdp_mmu(vcpu, cpu_role); - else - init_kvm_softmmu(vcpu, cpu_role); + init_kvm_page_walk(vcpu, &vcpu->arch.gva_walk, cpu_role); + + if (!mmu_is_nested(vcpu)) { + if (tdp_enabled) + init_kvm_tdp_mmu(vcpu, cpu_role); + else + init_kvm_shadow_mmu(vcpu, cpu_role); + } } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu); @@ -6127,10 +6171,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) */ vcpu->arch.root_mmu.root_role.invalid = 1; vcpu->arch.guest_mmu.root_role.invalid = 1; - vcpu->arch.nested_mmu.root_role.invalid = 1; - vcpu->arch.root_mmu.cpu_role.ext.valid = 0; - vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; - vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; + vcpu->arch.ngpa_walk.cpu_role.ext.valid = 0; + vcpu->arch.gva_walk.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm); @@ -6617,22 +6659,31 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu 
*mmu write_unlock(&vcpu->kvm->mmu_lock); } -void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, u64 addr, unsigned long roots) { + struct kvm_mmu *mmu; int i; WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); /* It's actually a GPA for vcpu->arch.guest_mmu. */ - if (mmu != &vcpu->arch.guest_mmu) { + if (w == &vcpu->arch.gva_walk) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_invlpg_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); + + if (tdp_enabled) + return; + + mmu = &vcpu->arch.root_mmu; + } else { + mmu = &vcpu->arch.guest_mmu; } + /* Invalidate shadow pages, whether GVA->GPA or nGPA->GPA. */ if (!mmu->sync_spte) return; @@ -6658,7 +6709,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) * be synced when switching to that new cr3, so nothing needs to be * done here for them. */ - kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); + kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg); @@ -6680,7 +6731,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (roots) - kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); + kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, roots); ++vcpu->stat.invlpg; /* @@ -6725,11 +6776,12 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->pml5_root); } -static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct kvm_pagewalk *w) { struct page *page; int i; + mmu->w = w; mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; mmu->mirror_root_hpa = INVALID_PAGE; @@ -6795,13 +6847,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = 
&vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu, &vcpu->arch.ngpa_walk); if (ret) return ret; - ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu, &vcpu->arch.gva_walk); if (ret) goto fail_allocate_root; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 73cdcbccc89e..c29002c60126 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -290,8 +290,6 @@ struct kvm_page_fault { bool write_fault_to_shadow_pgtable; }; -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); - /* * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(), * and of course kvm_mmu_do_page_fault(). @@ -337,70 +335,6 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, fault->is_private); } -static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 err, bool prefetch, - int *emulation_type, u8 *level) -{ - struct kvm_page_fault fault = { - .addr = cr2_or_gpa, - .error_code = err, - .exec = err & PFERR_FETCH_MASK, - .write = err & PFERR_WRITE_MASK, - .present = err & PFERR_PRESENT_MASK, - .rsvd = err & PFERR_RSVD_MASK, - .user = err & PFERR_USER_MASK, - .prefetch = prefetch, - .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = - is_nx_huge_page_enabled(vcpu->kvm), - - .max_level = KVM_MAX_HUGEPAGE_LEVEL, - .req_level = PG_LEVEL_4K, - .goal_level = PG_LEVEL_4K, - .is_private = err & PFERR_PRIVATE_ACCESS, - - .pfn = KVM_PFN_ERR_FAULT, - }; - int r; - - if (vcpu->arch.mmu->root_role.direct) { - /* - * Things like memslots don't understand the concept of a shared - * bit. Strip it so that the GFN can be used like normal, and the - * fault.addr can be used when the shared bit is needed. 
- */ - fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm); - fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); - } - - /* - * With retpoline being active an indirect call is rather expensive, - * so do a direct call in the most common case. - */ - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) - r = kvm_tdp_page_fault(vcpu, &fault); - else - r = vcpu->arch.mmu->page_fault(vcpu, &fault); - - /* - * Not sure what's happening, but punt to userspace and hope that - * they can fix it by changing memory to shared, or they can - * provide a better error. - */ - if (r == RET_PF_EMULATE && fault.is_private) { - pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); - kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); - return -EFAULT; - } - - if (fault.write_fault_to_shadow_pgtable && emulation_type) - *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; - if (level) - *level = fault.goal_level; - - return r; -} - int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index df3ae0c7ec2c..e73fc09ec4db 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -55,7 +55,7 @@ #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 - #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) + #define PT_HAVE_ACCESSED_DIRTY(w) (!(w)->cpu_role.base.ad_disabled) #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value @@ -106,13 +106,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, +static inline void FNAME(protect_clean_gpte)(struct kvm_pagewalk *w, unsigned 
*access, unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_HAVE_ACCESSED_DIRTY(mmu)) + if (!PT_HAVE_ACCESSED_DIRTY(w)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -124,7 +124,7 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce *access &= mask; } -static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu, +static inline int FNAME(is_present_gpte)(struct kvm_pagewalk *w, unsigned long pte) { #if PTTYPE != PTTYPE_EPT @@ -134,38 +134,40 @@ static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu, * For EPT, an entry is present if any of bits 2:0 are set. * With mode-based execute control, bit 10 also indicates presence. */ - return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0)); + return pte & (7 | (is_cr4_smep(w) ? VMX_EPT_USER_EXECUTABLE_MASK : 0)); #endif } -static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) +static bool FNAME(is_bad_mt_xwr)(struct kvm_page_format *fmt, u64 gpte) { #if PTTYPE != PTTYPE_EPT return false; #else - return __is_bad_mt_xwr(rsvd_check, gpte); + return __is_bad_mt_xwr(fmt, gpte); #endif } -static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +static bool FNAME(is_rsvd_bits_set)(struct kvm_page_format *fmt, u64 gpte, int level) { - return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || - FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); + return __is_rsvd_bits_set(fmt, gpte, level) || + FNAME(is_bad_mt_xwr)(fmt, gpte); } static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte)) + struct kvm_pagewalk *w = vcpu->arch.mmu->w; + + if (!FNAME(is_present_gpte)(w, gpte)) goto no_present; /* Prefetch only accessed entries (unless A/D bits are disabled). 
*/ - if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && + if (PT_HAVE_ACCESSED_DIRTY(w) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) + if (FNAME(is_rsvd_bits_set)(&w->fmt, gpte, PG_LEVEL_4K)) goto no_present; return false; @@ -206,7 +208,7 @@ static inline unsigned FNAME(gpte_access)(u64 gpte) } static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, + struct kvm_pagewalk *w, struct guest_walker *walker, gpa_t addr, int write_fault) { @@ -217,7 +219,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_HAVE_ACCESSED_DIRTY(mmu)) + if (!PT_HAVE_ACCESSED_DIRTY(w)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ -278,7 +280,7 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) return pkeys; } -static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, +static inline bool FNAME(is_last_gpte)(struct kvm_pagewalk *w, unsigned int level, unsigned int gpte) { /* @@ -296,7 +298,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, * is not reserved and does not indicate a large page at this level, * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. */ - gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse); + gpte &= level - (PT32_ROOT_LEVEL + w->cpu_role.ext.cr4_pse); #endif /* * PG_LEVEL_4K always terminates. The RHS has bit 7 set @@ -311,7 +313,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, * Fetch a guest pte for a guest virtual address, or for an L2's GPA. 
*/ static int FNAME(walk_addr_generic)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, gpa_t addr, u64 access) { int ret; @@ -340,16 +342,16 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - walker->level = mmu->cpu_role.base.level; - pte = kvm_mmu_get_guest_pgd(vcpu, mmu); - have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); + walker->level = w->cpu_role.base.level; + pte = kvm_mmu_get_guest_pgd(vcpu, w); + have_ad = PT_HAVE_ACCESSED_DIRTY(w); #if PTTYPE == 64 walk_nx_mask = 1ULL << PT64_NX_SHIFT; if (walker->level == PT32E_ROOT_LEVEL) { - pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); + pte = w->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!FNAME(is_present_gpte)(mmu, pte)) + if (!FNAME(is_present_gpte)(w, pte)) goto error; --walker->level; } @@ -393,7 +395,7 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), + real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(table_gfn), nested_access | PFERR_GUEST_PAGE_MASK, &walker->fault, 0); @@ -422,10 +424,10 @@ retry_walk: */ pte_access = pt_access & (pte ^ walk_nx_mask); - if (unlikely(!FNAME(is_present_gpte)(mmu, pte))) + if (unlikely(!FNAME(is_present_gpte)(w, pte))) goto error; - if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(&w->fmt, pte, walker->level))) { errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } @@ -434,14 +436,14 @@ retry_walk: /* Convert to ACC_*_MASK flags for struct guest_walker. 
*/ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); - } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); + } while (!FNAME(is_last_gpte)(w, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); - errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); + errcode = permission_fault(vcpu, w, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -453,7 +455,7 @@ retry_walk: gfn += pse36_gfn_delta(pte); #endif - real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), + real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(gfn), access | PFERR_GUEST_FINAL_MASK, &walker->fault, walker->pte_access); if (real_gpa == INVALID_GPA) @@ -462,7 +464,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); + FNAME(protect_clean_gpte)(w, &walker->pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -473,7 +475,7 @@ retry_walk: (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, + ret = FNAME(update_accessed_dirty_bits)(vcpu, w, walker, addr, write_fault); if (unlikely(ret < 0)) goto error; @@ -485,7 +487,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && has_pferr_fetch(mmu)) + if (fetch_fault && has_pferr_fetch(w)) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; @@ -540,13 +542,13 @@ error: * ACC_*_MASK flags! 
*/ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); - if (mmu_has_mbec(mmu)) + if (is_cr4_smep(w)) walker->fault.exit_qualification |= EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access); } #endif walker->fault.address = addr; - walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.nested_page_fault = w != &vcpu->arch.gva_walk; walker->fault.async_page_fault = false; #if PTTYPE != PTTYPE_EPT @@ -561,7 +563,7 @@ error: static int FNAME(walk_addr)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gpa_t addr, u64 access) { - return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, + return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu->w, addr, access); } @@ -577,7 +579,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte); return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } @@ -660,7 +662,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; - top_level = vcpu->arch.mmu->cpu_role.base.level; + top_level = vcpu->arch.mmu->w->cpu_role.base.level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* @@ -849,7 +851,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * otherwise KVM will cache incorrect access information in the SPTE. 
*/ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && - !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { + !is_cr0_wp(vcpu->arch.mmu->w) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -859,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * then we should prevent the kernel from executing it * if SMEP is enabled. */ - if (is_cr4_smep(vcpu->arch.mmu)) + if (is_cr4_smep(vcpu->arch.mmu->w)) walker.pte_access &= ~ACC_EXEC_MASK; } #endif @@ -894,7 +896,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w, gpa_t addr, u64 access, struct x86_exception *exception) { @@ -904,10 +906,10 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. 
*/ - WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); + WARN_ON_ONCE((addr >> 32) && w == &vcpu->arch.gva_walk); #endif - r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); + r = FNAME(walk_addr_generic)(&walker, vcpu, w, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); @@ -957,7 +959,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) return 0; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 72d2394e089c..5fc27e9733b3 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -281,9 +281,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); - WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), + WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->fmt, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, - get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + get_rsvd_bits(&vcpu->arch.mmu->fmt, spte, level)); /* * Mark the memslot dirty *after* modifying it for access tracking. diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 13eea94dd212..e730717824b3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -357,17 +357,6 @@ static inline bool is_last_spte(u64 pte, int level) return (level == PG_LEVEL_4K) || is_large_pte(pte); } -static inline bool is_executable_pte(u64 spte) -{ - /* - * For now, return true if either the XS or XU bit is set - * This function is only used for fast_page_fault, - * which never processes shadow EPT, and regular page - * tables always have XS==XU. 
- */ - return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask; -} - static inline kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -378,33 +367,33 @@ static inline bool is_accessed_spte(u64 spte) return spte & shadow_accessed_mask; } -static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, +static inline u64 get_rsvd_bits(struct kvm_page_format *fmt, u64 pte, int level) { int bit7 = (pte >> 7) & 1; - return rsvd_check->rsvd_bits_mask[bit7][level-1]; + return fmt->rsvd_bits_mask[bit7][level-1]; } -static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, +static inline bool __is_rsvd_bits_set(struct kvm_page_format *fmt, u64 pte, int level) { - return pte & get_rsvd_bits(rsvd_check, pte, level); + return pte & get_rsvd_bits(fmt, pte, level); } -static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, +static inline bool __is_bad_mt_xwr(struct kvm_page_format *fmt, u64 pte) { if (pte & VMX_EPT_USER_EXECUTABLE_MASK) pte |= VMX_EPT_EXECUTABLE_MASK; - return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); + return fmt->bad_mt_xwr & BIT_ULL(pte & 0x3f); } -static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, +static __always_inline bool is_rsvd_spte(struct kvm_page_format *fmt, u64 spte, int level) { - return __is_bad_mt_xwr(rsvd_check, spte) || - __is_rsvd_bits_set(rsvd_check, spte, level); + return __is_bad_mt_xwr(fmt, spte) || + __is_rsvd_bits_set(fmt, spte, level); } /* @@ -496,20 +485,40 @@ static inline bool is_mmu_writable_spte(u64 spte) } /* - * Returns true if the access indicated by @fault is allowed by the existing - * SPTE protections. Note, the caller is responsible for checking that the - * SPTE is a shadow-present, leaf SPTE (either before or after). + * Returns true if the access indicated by @fault is forbidden by the existing + * SPTE protections. 
*/ -static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +static inline bool spte_permission_fault(struct kvm_mmu *mmu, u64 spte, + struct kvm_page_fault *fault) { - if (fault->exec) - return is_executable_pte(spte); + unsigned pfec, pte_access; - if (fault->write) - return is_writable_pte(spte); + if (!is_shadow_present_pte(spte)) + return true; - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; + BUILD_BUG_ON(PT_PRESENT_MASK != ACC_READ_MASK); + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != ACC_READ_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != ACC_WRITE_MASK); + + /* strip nested paging fault error codes */ + pte_access = spte & (PT_PRESENT_MASK | PT_WRITABLE_MASK); + if (shadow_nx_mask) { + pte_access |= spte & shadow_user_mask ? ACC_USER_MASK : 0; + pte_access |= spte & shadow_nx_mask ? 0 : ACC_EXEC_MASK; + } else { + pte_access |= spte & shadow_xs_mask ? ACC_EXEC_MASK : 0; + pte_access |= spte & shadow_xu_mask ? ACC_USER_EXEC_MASK : 0; + } + + /* + * RSVD is handled elsewhere, and is used for SMAP in the context + * of accessing fmt.permissions[]. SPTEs never use PK or SS, as + * they are not supported for shadow paging and irrelevant for TDP. 
+ */ + pfec = fault->error_code & ( + PFERR_WRITE_MASK | PFERR_USER_MASK | PFERR_FETCH_MASK); + return (mmu->fmt.permissions[pfec >> 1] >> pte_access) & 1; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c1cbae65d239..ce3f2efadb05 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1122,6 +1122,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, struct tdp_iter *iter) { + struct kvm_mmu *mmu = vcpu->arch.mmu; struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; @@ -1131,7 +1132,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, return RET_PF_RETRY; if (is_shadow_present_pte(iter->old_spte) && - (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + (fault->prefetch || !spte_permission_fault(mmu, iter->old_spte, fault)) && is_last_spte(iter->old_spte, iter->level)) { WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); return RET_PF_SPURIOUS; diff --git a/arch/x86/kvm/msrs.c b/arch/x86/kvm/msrs.c new file mode 100644 index 000000000000..c230b18d87e3 --- /dev/null +++ b/arch/x86/kvm/msrs.c @@ -0,0 +1,2745 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> +#include <asm/intel_pt.h> +#include <asm/vmx.h> + +#include "hyperv.h" +#include "lapic.h" +#include "msrs.h" +#include "pmu.h" +#include "trace.h" +#include "vmx/vmx.h" +#include "xen.h" +#include "x86.h" + +bool __read_mostly ignore_msrs = 0; +module_param(ignore_msrs, bool, 0644); + +bool __read_mostly report_ignored_msrs = true; +module_param(report_ignored_msrs, bool, 0644); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); + +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static +u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); +#else +static u64 
__read_mostly efer_reserved_bits = ~((u64)EFER_SCE); +#endif + +#define MAX_IO_MSRS 256 + +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + +struct kvm_x86_msr_filter { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; +}; + +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. + */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 + +struct kvm_user_return_msrs { + struct user_return_notifier urn; + bool registered; + struct kvm_user_return_msr_values { + u64 host; + u64 curr; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; +}; + +u32 __read_mostly kvm_nr_uret_msrs; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; +static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); + +void kvm_destroy_user_return_msrs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); + + kvm_nr_uret_msrs = 0; +} + +static void kvm_on_user_return(struct user_return_notifier *urn) +{ + unsigned slot; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; + + msrs->registered = false; + user_return_notifier_unregister(urn); + + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { + values = &msrs->values[slot]; + if (values->host != values->curr) { + wrmsrq(kvm_uret_msrs_list[slot], values->host); + values->curr = values->host; + } + } +} + +static int kvm_probe_user_return_msr(u32 msr) +{ + u64 val; + int ret; + + preempt_disable(); + ret = rdmsrq_safe(msr, &val); + if (ret) + goto out; + ret = wrmsrq_safe(msr, val); +out: + preempt_enable(); + return ret; +} + +int kvm_add_user_return_msr(u32 msr) +{ + BUG_ON(kvm_nr_uret_msrs >= 
KVM_MAX_NR_USER_RETURN_MSRS); + + if (kvm_probe_user_return_msr(msr)) + return -1; + + kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; + return kvm_nr_uret_msrs++; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); + +int kvm_find_user_return_msr(u32 msr) +{ + int i; + + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + if (kvm_uret_msrs_list[i] == msr) + return i; + } + return -1; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); + +void kvm_user_return_msr_cpu_online(void) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); + u64 value; + int i; + + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + rdmsrq_safe(kvm_uret_msrs_list[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; + } +} + +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); + int err; + + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) + return 0; + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); + if (err) + return 1; + + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); + +u64 kvm_get_user_return_msr(unsigned int slot) +{ + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); + +void drop_user_return_notifiers(void) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); + + if (msrs->registered) + kvm_on_user_return(&msrs->urn); +} + +/* + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. + */ + +static const u32 msrs_to_save_base[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, + MSR_IA32_DEBUGCTLMSR, + MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP, + MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, +}; + +static const u32 msrs_to_save_pmu[] = { + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. 
*/ + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, +}; + +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; +static unsigned num_msrs_to_save; + +static const u32 emulated_msrs_all[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, + 
HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, + + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSC_DEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, + MSR_IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, +}; + +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; +static unsigned num_emulated_msrs; + +/* + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. 
+ */ +static const u32 msr_based_features_all_except_vmx[] = { + MSR_AMD64_DE_CFG, + MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_PLATFORM_INFO, +}; + +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; +static unsigned int num_msr_based_features; + +int kvm_get_msr_index_list(struct kvm_msr_list __user *user_msr_list) +{ + struct kvm_msr_list msr_list; + unsigned int n; + + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) + return -EFAULT; + + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) + return -EFAULT; + + if (n < msr_list.nmsrs) + return -E2BIG; + + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + return -EFAULT; + + if (copy_to_user(user_msr_list->indices + num_msrs_to_save, + &emulated_msrs, num_emulated_msrs * sizeof(u32))) + return -EFAULT; + + return 0; +} + +int kvm_get_feature_msr_index_list(struct kvm_msr_list __user *user_msr_list) +{ + struct kvm_msr_list msr_list; + unsigned int n; + + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) + return -EFAULT; + + n = msr_list.nmsrs; + msr_list.nmsrs = num_msr_based_features; + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) + return -EFAULT; + + if (n < msr_list.nmsrs) + return -E2BIG; + + if (copy_to_user(user_msr_list->indices, &msr_based_features, + num_msr_based_features * sizeof(u32))) + return -EFAULT; + + return 0; +} + +/* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. 
+ */ +static bool kvm_is_immutable_feature_msr(u32 msr) +{ + int i; + + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) + return true; + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +static bool kvm_is_advertised_msr(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + for (i = 0; i < num_emulated_msrs; i++) { + if (emulated_msrs[i] == msr_index) + return true; + } + + return false; +} + + +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) + +u64 kvm_get_arch_capabilities(void) +{ + u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; + + /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. 
+ * If an outer hypervisor is doing the cache flush for us + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that + * capability to the guest too, and if EPT is disabled we're not + * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will + * require a nested hypervisor to do a flush of its own. + */ + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; + + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ + data &= ~ARCH_CAP_TAA_NO; + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { + data |= ARCH_CAP_TAA_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. 
+ */ + } + + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + + return data; +} + +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + WARN_ON_ONCE(!host_initiated); + + switch (index) { + case MSR_IA32_ARCH_CAPABILITIES: + *data = kvm_get_arch_capabilities(); + break; + case MSR_IA32_PERF_CAPABILITIES: + *data = kvm_caps.supported_perf_cap; + break; + case MSR_PLATFORM_INFO: + *data = MSR_PLATFORM_INFO_CPUID_FAULT; + break; + case MSR_IA32_UCODE_REV: + rdmsrq_safe(index, data); + break; + default: + return kvm_x86_call(get_feature_msr)(index, data); + } + return 0; +} + +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated); + +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, + u64 *data, bool host_initiated, + enum kvm_msr_access rw, + msr_access_t msr_access_fn) +{ + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; + int ret; + + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); + + /* + * Zero the data on read failures to avoid leaking stack data to the + * guest and/or userspace, e.g. if the failure is ignored below. + */ + ret = msr_access_fn(vcpu, msr, data, host_initiated); + if (ret && rw == MSR_TYPE_R) + *data = 0; + + if (ret != KVM_MSR_RET_UNSUPPORTED) + return ret; + + /* + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM + * advertises to userspace, even if an MSR isn't fully supported. + * Simply check that @data is '0', which covers both the write '0' case + * and all reads (in which case @data is zeroed on failure; see above). 
+ */ + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) + return 0; + + if (!ignore_msrs) { + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", + op, msr, *data); + return ret; + } + + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); + + return 0; +} + +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, + kvm_get_feature_msr); +} + +static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) + return false; + + if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) + return false; + + if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) + return false; + + if (efer & (EFER_LME | EFER_LMA) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return false; + + if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) + return false; + + return true; + +} +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & efer_reserved_bits) + return false; + + return __kvm_valid_efer(vcpu, efer); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); + +static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + u64 old_efer = vcpu->arch.efer; + u64 efer = msr_info->data; + int r; + + if (efer & efer_reserved_bits) + return 1; + + if (!msr_info->host_initiated) { + if (!__kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) && + (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + } + + efer &= ~EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + r = kvm_x86_call(set_efer)(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } + + if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) + kvm_mmu_reset_context(vcpu); + + if (!static_cpu_has(X86_FEATURE_XSAVES) && + (efer & EFER_SVME)) + kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + + return 0; +} + +void 
kvm_enable_efer_bits(u64 mask) +{ + efer_reserved_bits &= ~mask; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits); + +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; + struct kvm *kvm = vcpu->kvm; + bool allowed; + int idx; + u32 i; + + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) + return true; + + idx = srcu_read_lock(&kvm->srcu); + + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + allowed = test_bit(index - start, bitmap); + break; + } + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + + return allowed; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); + +/* + * Write @data into the MSR specified by @index. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, + bool host_initiated) +{ + struct msr_data msr; + + switch (index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. 
Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + data = __canonical_address(data, max_host_virt_addr_bits()); + break; + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) + return 1; + + /* + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has + * incomplete and conflicting architectural behavior. Current + * AMD CPUs completely ignore bits 63:32, i.e. they aren't + * reserved and always read as zeros. Enforce Intel's reserved + * bits check if the guest CPU is Intel compatible, otherwise + * clear the bits. This ensures cross-vendor migration will + * provide consistent behavior for the guest. + */ + if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0) + return 1; + + data = (u32)data; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... 
MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. + */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; + } + + msr.data = data; + msr.index = index; + msr.host_initiated = host_initiated; + + return kvm_x86_call(set_msr)(vcpu, &msr); +} + +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + return __kvm_set_msr(vcpu, index, *data, host_initiated); +} + +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, + _kvm_set_msr); +} + +/* + * Read the MSR specified by @index into @data. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + struct msr_data msr; + int ret; + + switch (index) { + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) + return 1; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... 
MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; + } + + msr.index = index; + msr.host_initiated = host_initiated; + + ret = kvm_x86_call(get_msr)(vcpu, &msr); + if (!ret) + *data = msr.data; + return ret; +} + +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, + __kvm_get_msr); +} + +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, true); +} + +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return kvm_get_msr_ignored_check(vcpu, index, data, false); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read); + +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return kvm_set_msr_ignored_check(vcpu, index, data, false); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write); + +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read); + +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write); + +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + 
kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; + break; + case MSR_IA32_TSC_DEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + trace_kvm_msr_write(msr, data); + + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_ecx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); + +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); + +static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) { + kvm_eax_write(vcpu, vcpu->run->msr.data); + kvm_edx_write(vcpu, vcpu->run->msr.data >> 32); + } +} + +static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_COMPLETE_USER_EXIT); +} + +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) +{ + return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_emulated_msr_access(vcpu); +} + +static int complete_fast_msr_access(struct kvm_vcpu *vcpu) +{ + return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error); +} + +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_fast_msr_access(vcpu); +} + +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return 
complete_fast_msr_access(vcpu); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case KVM_MSR_RET_UNSUPPORTED: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case KVM_MSR_RET_FILTERED: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) +{ + u64 data; + int r; + + r = kvm_emulate_msr_read(vcpu, msr, &data); + + if (!r) { + trace_kvm_msr_read(msr, data); + + if (reg < 0) { + kvm_eax_write(vcpu, data); + kvm_edx_write(vcpu, data >> 32); + } else { + kvm_register_write(vcpu, reg, data); + } + } else { + /* MSR read failed? 
See if we should ask user space */ + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, + complete_rdmsr, r)) + return 0; + trace_kvm_msr_read_ex(msr); + } + + return kvm_x86_call(complete_emulated_msr)(vcpu, r); +} + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_rdmsr(vcpu, kvm_ecx_read(vcpu), -1, + complete_fast_rdmsr); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr); + +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + vcpu->arch.cui_rdmsr_imm_reg = reg; + + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm); + +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int r; + + r = kvm_emulate_msr_write(vcpu, msr, data); + if (!r) { + trace_kvm_msr_write(msr, data); + } else { + /* MSR write failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, + complete_fast_msr_access, r)) + return 0; + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + trace_kvm_msr_write_ex(msr, data); + } + + return kvm_x86_call(complete_emulated_msr)(vcpu, r); +} + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_wrmsr(vcpu, kvm_ecx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); + +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); + +int kvm_emulator_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index, + u64 *pdata) +{ + int r; + + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); + if (r < 0) + return X86EMUL_UNHANDLEABLE; + + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_read_ex(msr_index); + return X86EMUL_PROPAGATE_FAULT; + } + 
+ trace_kvm_msr_read(msr_index, *pdata); + return X86EMUL_CONTINUE; +} + +int kvm_emulator_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index, + u64 data) +{ + int r; + + r = kvm_emulate_msr_write(vcpu, msr_index, data); + if (r < 0) + return X86EMUL_UNHANDLEABLE; + + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_write_ex(msr_index, data); + return X86EMUL_PROPAGATE_FAULT; + } + + trace_kvm_msr_write(msr_index, data); + return X86EMUL_CONTINUE; +} + +int kvm_emulator_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(vcpu, msr_index, pdata); + + return __kvm_emulate_msr_read(vcpu, msr_index, pdata); +} + +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). 
+ * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. + */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); +} + +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) +{ + int version; + int r; + struct pvclock_wall_clock wc; + u32 wc_sec_hi; + u64 wall_nsec; + + if (!wall_clock) + return; + + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; + + if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) + return; + + wall_nsec = kvm_get_wall_clock_epoch(kvm); + + wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); + wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ + wc.version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + if (sec_hi_ofs) { + wc_sec_hi = wall_nsec >> 32; + kvm_write_guest(kvm, wall_clock + sec_hi_ofs, + &wc_sec_hi, sizeof(wc_sec_hi)); + } + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +} + +static void 
kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock != old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + if (system_time & 1) + kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); + else + kvm_gpc_deactivate(&vcpu->arch.pv_time); + + return; +} + +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + +/* + * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. + */ +static bool can_set_mci_status(struct kvm_vcpu *vcpu) +{ + /* McStatusWrEn enabled? */ + if (guest_cpuid_is_amd_compatible(vcpu)) + return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); + + return false; +} + +static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u32 msr = msr_info->index; + u64 data = msr_info->data; + u32 offset, last_msr; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P) && + (data || !msr_info->host_initiated)) + return 1; + if (data != 0 && data != ~(u64)0) + return 1; + vcpu->arch.mcg_ctl = data; + break; + case MSR_IA32_MC0_CTL2 ... 
MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; + + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. 
+ */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: + return 1; + } + return 0; +} + +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) +{ + gpa_t gpa = data & ~0x3f; + + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + + if (!lapic_in_kernel(vcpu)) + return data ? 1 : 0; + + if (__kvm_pv_async_pf_enabled(data) && + kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + sizeof(u64))) + return 1; + + vcpu->arch.apf.msr_en_val = data; + + if (__kvm_pv_async_pf_enabled(data)) { + kvm_async_pf_wakeup_all(vcpu); + } else { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + + return 0; +} + +#ifdef CONFIG_X86_64 +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; +} +#endif + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + u32 msr = msr_info->index; + u64 data = msr_info->data; + + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. 
+ */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) + return kvm_xen_write_hypercall_page(vcpu, data); + + switch (msr) { + case MSR_AMD64_NB_CFG: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: + case MSR_F15H_EX_CFG: + break; + + case MSR_IA32_UCODE_REV: + if (msr_info->host_initiated) + vcpu->arch.microcode_version = data; + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; + vcpu->arch.arch_capabilities = data; + break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; + + if (data & ~kvm_caps.supported_perf_cap) + return 1; + + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. 
+ */ + if (vcpu->arch.perf_capabilities == data) + break; + + vcpu->arch.perf_capabilities = data; + kvm_pmu_refresh(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); + break; + case MSR_IA32_PRED_CMD: { + u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); + + if (!msr_info->host_initiated) { + if ((!guest_has_pred_cmd_msr(vcpu))) + return 1; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + } + + if (!boot_cpu_has(X86_FEATURE_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!boot_cpu_has(X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + + if (data & reserved_bits) + return 1; + + if (!data) + break; + + wrmsrq(MSR_IA32_PRED_CMD, data); + break; + } + case MSR_IA32_FLUSH_CMD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) + return 1; + if (!data) + break; + + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + break; + case MSR_EFER: + return set_efer(vcpu, msr_info); + case MSR_K7_HWCR: { + /* + * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 + * through at least v6.6 whine if TscFreqSel is clear, + * depending on F/M/S. 
+ */ + u64 valid = BIT_ULL(18) | BIT_ULL(24); + + data &= ~(u64)0x40; /* ignore flush filter disable */ + data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID)) + valid |= MSR_K7_HWCR_CPUID_USER_DIS; + + if (data & ~valid) { + kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + vcpu->arch.msr_hwcr = data; + break; + } + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + kvm_pr_unimpl_wrmsr(vcpu, msr, data); + return 1; + } + break; + case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + return kvm_mtrr_set_msr(vcpu, msr, data); + case MSR_IA32_APICBASE: + return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSC_DEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { + if (!msr_info->host_initiated) { + s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. 
*/ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) + return 1; + vcpu->arch.ia32_misc_enable_msr = data; + vcpu->arch.cpuid_dynamic_bits_dirty = true; + } else { + vcpu->arch.ia32_misc_enable_msr = data; + } + break; + } + case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) + return 1; + vcpu->arch.smbase = data; + break; + case MSR_IA32_POWER_CTL: + vcpu->arch.msr_ia32_power_ctl = data; + break; + case MSR_IA32_TSC: + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, &data); + } else if (!vcpu->arch.guest_tsc_protected) { + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } + break; + case MSR_IA32_XSS: + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; + + if (data & ~vcpu->arch.guest_supported_xss) + return 1; + if (vcpu->arch.ia32_xss == data) + break; + vcpu->arch.ia32_xss = data; + vcpu->arch.cpuid_dynamic_bits_dirty = true; + break; + case MSR_SMI_COUNT: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smi_count = data; + break; + case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return KVM_MSR_RET_UNSUPPORTED; + + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); + break; + case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return KVM_MSR_RET_UNSUPPORTED; + + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); + break; + case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return KVM_MSR_RET_UNSUPPORTED; + + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case 
MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return KVM_MSR_RET_UNSUPPORTED; + + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); + break; + case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return KVM_MSR_RET_UNSUPPORTED; + + if (kvm_pv_enable_async_pf(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return KVM_MSR_RET_UNSUPPORTED; + + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return KVM_MSR_RET_UNSUPPORTED; + if (data & 0x1) { + /* + * Pairs with the smp_mb__after_atomic() in + * kvm_arch_async_page_present_queued(). + */ + smp_store_mb(vcpu->arch.apf.pageready_pending, false); + + kvm_check_async_pf_completion(vcpu); + } + break; + case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return KVM_MSR_RET_UNSUPPORTED; + + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + + break; + case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return KVM_MSR_RET_UNSUPPORTED; + + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) + return 1; + break; + + case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return KVM_MSR_RET_UNSUPPORTED; + + /* only enable bit supported */ + if (data & (-1ULL << 1)) + return 1; + + vcpu->arch.msr_kvm_poll_control = data; + break; + + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + return set_msr_mce(vcpu, msr_info); + + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... 
MSR_P6_PERFCTR1: + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); + break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to specify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; +#ifdef CONFIG_KVM_HYPERV + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return kvm_hv_set_msr_common(vcpu, msr, data, + msr_info->host_initiated); +#endif + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. 
+ */ + kvm_pr_unimpl_wrmsr(vcpu, msr, data); + break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) + return 1; + vcpu->arch.osvw.status = data; + break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; + default: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + + return KVM_MSR_RET_UNSUPPORTED; + } + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common); + +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; + + switch (msr) { + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + data = 0; + break; + case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P) && !host) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; + + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + switch (msr_info->index) { + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + case MSR_AMD64_SYSCFG: + case MSR_K8_TSEG_ADDR: + case MSR_K8_TSEG_MASK: + case MSR_VM_HSAVE_PA: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: + case MSR_AMD64_BU_CFG2: + case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: + case MSR_F15H_EX_CFG: + /* + * Intel Sandy Bridge CPUs must support the RAPL (running average power + * limit) MSRs. Just return 0, as we do not want to expose the host + * data here. Do not conditionalize this on CPUID, as KVM does not do + * so for existing CPU-specific MSRs. + */ + case MSR_RAPL_POWER_UNIT: + case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ + case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ + case MSR_PKG_ENERGY_STATUS: /* Total package */ + case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ + msr_info->data = 0; + break; + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info); + msr_info->data = 0; + break; + case MSR_IA32_UCODE_REV: + msr_info->data = vcpu->arch.microcode_version; + break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; + msr_info->data = vcpu->arch.arch_capabilities; + break; + case MSR_IA32_PERF_CAPABILITIES: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; + msr_info->data = vcpu->arch.perf_capabilities; + break; + case MSR_IA32_POWER_CTL: + msr_info->data = vcpu->arch.msr_ia32_power_ctl; + break; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * On userspace reads and writes, however, we unconditionally + * return L1's TSC value to ensure backwards-compatible + * behavior for migration. + */ + u64 offset, ratio; + + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } + + msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; + break; + } + case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); + case 0xcd: /* fsb frequency */ + msr_info->data = 3; + break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. 
Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + msr_info->data = 1 << 24; + break; + case MSR_IA32_APICBASE: + msr_info->data = vcpu->arch.apic_base; + break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); + case MSR_IA32_TSC_DEADLINE: + msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; + case MSR_IA32_TSC_ADJUST: + msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; + case MSR_IA32_MISC_ENABLE: + msr_info->data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_SMBASE: + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) + return 1; + msr_info->data = vcpu->arch.smbase; + break; + case MSR_SMI_COUNT: + msr_info->data = vcpu->arch.smi_count; + break; + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + msr_info->data = 1000ULL; + /* CPU multiplier */ + msr_info->data |= (((uint64_t)4ULL) << 40); + break; + case MSR_EFER: + msr_info->data = vcpu->arch.efer; + break; + case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.time; + break; + case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.time; + break; + case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return KVM_MSR_RET_UNSUPPORTED; + + 
msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.apf.msr_int_val; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = 0; + break; + case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.st.msr_val; + break; + case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.pv_eoi.msr_val; + break; + case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return KVM_MSR_RET_UNSUPPORTED; + + msr_info->data = vcpu->arch.msr_kvm_poll_control; + break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, + msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + msr_info->data = 0x20000000; + break; +#ifdef CONFIG_KVM_HYPERV + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return kvm_hv_get_msr_common(vcpu, + msr_info->index, &msr_info->data, + msr_info->host_initiated); +#endif + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + msr_info->data = 0xbe702111; + break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) + return 1; + msr_info->data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) + return 1; + msr_info->data = vcpu->arch.osvw.status; + break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated && + !vcpu->kvm->arch.guest_can_read_msr_platform_info) + return 1; + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; + case MSR_K7_HWCR: + msr_info->data = vcpu->arch.msr_hwcr; + break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; + default: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info); + + return KVM_MSR_RET_UNSUPPORTED; + } + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); + +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_get_msr_ignored_check(vcpu, index, data, true); +} + +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + u64 val; + + /* + * Reject writes to immutable feature MSRs if the vCPU model is frozen, + * as KVM doesn't support modifying the guest vCPU model on the fly, + * e.g. changing the VMX capabilities MSRs while L2 is active is + * nonsensical. Allow writes of the same value, e.g. so that userspace + * can blindly stuff all MSRs when emulating RESET. + */ + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && + kvm_is_immutable_feature_msr(index) && + (do_get_msr(vcpu, index, &val) || *data != val)) + return -EINVAL; + + return kvm_set_msr_ignored_check(vcpu, index, *data, true); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + bool fpu_loaded = false; + int i; + + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. 
Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + unsigned size; + int r; + /* Fetch the fixed-size header to learn how many entries follow. */ + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) + goto out; + /* Bound the request size; note that nmsrs == MAX_IO_MSRS is rejected as well. */ + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + /* Copy the entry array into a kernel buffer, freed with kfree() below. */ + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = memdup_user(user_msrs->entries, size); + if (IS_ERR(entries)) { + r = PTR_ERR(entries); + goto out; + } + + r = __msr_io(vcpu, &msrs, entries, do_msr); + /* With @writeback, copy back all entries, even if only the first r were processed. */ + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + r = -EFAULT; + + kfree(entries); +out: + return r; +} +/* Read feature MSRs into a userspace buffer; no vCPU is involved (NULL is passed). */ +int kvm_get_feature_msrs(struct kvm_msrs __user *user_msrs) +{ + return msr_io(NULL, user_msrs, do_get_feature_msr, 1); +} +/* Read a batch of MSRs for userspace under kvm->srcu; values are copied back. */ +int kvm_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs) +{ + guard(srcu)(&vcpu->kvm->srcu); + + return msr_io(vcpu, user_msrs, do_get_msr, 1); +} +/* Write a batch of MSRs from userspace under kvm->srcu; entries are not copied back. */ +int kvm_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs) +{ + guard(srcu)(&vcpu->kvm->srcu); + + return msr_io(vcpu, user_msrs, do_set_msr, 0); +} +/* Read MSR @msr and store it at @user_val: -EINVAL if the read fails, -EFAULT if the store does. */ +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} +/* Load a value from @user_val and write it to MSR @msr: -EFAULT or -EINVAL on failure. */ +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} +/* Field-wise view of a register id; kvm_get_set_one_reg() overlays it on kvm_one_reg.id. */ +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; +/* Rewrite a KVM_X86_REG_TYPE_KVM register id in @reg as the MSR that backs it, or -EINVAL. */ +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) +{ + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: 
If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; +} +/* Get (@ioctl == KVM_GET_ONE_REG) or otherwise set the 64-bit MSR-backed register at @argp. */ +int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + bool load_fpu; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + /* View the id in place; the translation below rewrites one_reg.id through @reg. */ + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(vcpu, reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + /* XSTATE-managed MSRs are accessed with the guest FPU state loaded. */ + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + if (load_fpu) + kvm_put_guest_fpu(vcpu); + return r; +} +/* List KVM-defined regs (GUEST_SSP iff SHSTK); the needed count is stored even on -E2BIG. */ +int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 
1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; + /* Always report the required count, even if the buffer proves too small. */ + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + + return 0; +} +/* Allocate an MSR filter with the given default policy; NULL on allocation failure. */ +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc_obj(*msr_filter, GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} +/* Free @msr_filter along with the bitmap of each of its ranges; NULL is a no-op. */ +void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) +{ + u32 i; + + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); + + kfree(msr_filter); +} +/* Append @user_range (skipped if empty) to @msr_filter; ->count is not bounds-checked here. */ +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) +{ + unsigned long *bitmap; + size_t bitmap_size; + + if (!user_range->nmsrs) + return 0; + + if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) + return -EINVAL; + + if (!user_range->flags) + return -EINVAL; + /* The bitmap holds one bit per MSR in the range, rounded up to whole longs. */ + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + msr_filter->count++; + return 0; +} +/* Build a filter from @filter and swap it in for kvm->arch.msr_filter, freeing the old one. */ +int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter) +{ + struct kvm_x86_msr_filter *new_filter, *old_filter; + bool default_allow; + bool empty = true; + int r; + u32 i; + + if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= 
!filter->ranges[i].nmsrs; + /* A default-deny filter must contain at least one non-empty range. */ + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } + } + /* Publish the new filter, then wait for SRCU readers before freeing the old one. */ + mutex_lock(&kvm->lock); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); + + return 0; +} + +/* Add @msr_index to msr_based_features[] if kvm_get_feature_msr() succeeds for it. */ +static void kvm_probe_feature_msr(u32 msr_index) +{ + u64 data; + + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) + return; + + msr_based_features[num_msr_based_features++] = msr_index; +} +/* Add @msr_index to msrs_to_save[] if the host can read it and no check below excludes it. */ +static void kvm_probe_msr_to_save(u32 msr_index) +{ + u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. 
+ */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... + MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... + MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... 
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + case MSR_IA32_TSX_CTRL: + if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) + return; + break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; + default: + break; + } + /* No check above excluded the MSR: record it. */ + msrs_to_save[num_msrs_to_save++] = msr_index; +} +/* Rebuild msrs_to_save[], emulated_msrs[] and msr_based_features[] from scratch. */ +void kvm_init_msr_lists(void) +{ + unsigned i; + + BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3, + "Please update the fixed PMCs in msrs_to_save_pmu[]"); + /* Reset the list sizes before repopulating. */ + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); + + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); + } + + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_call(has_emulated_msr)(NULL, + emulated_msrs_all[i])) + continue; + + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; + } + + for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) + kvm_probe_feature_msr(i); + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) + 
kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]); +} + +int kvm_spec_ctrl_test_value(u64 value) +{ + /* + * Test that setting IA32_SPEC_CTRL to the given value is allowed by the host + * processor. Returns 0 if so (the saved value is then restored), 1 otherwise. + */ + + u64 saved_value; + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); + + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); diff --git a/arch/x86/kvm/msrs.h b/arch/x86/kvm/msrs.h new file mode 100644 index 000000000000..b698983e37fb --- /dev/null +++ b/arch/x86/kvm/msrs.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_KVM_MSR_H +#define ARCH_X86_KVM_MSR_H + +#include <linux/kvm_host.h> +#include <linux/user-return-notifier.h> + +#include "cpuid.h" +#include "regs.h" + +extern bool report_ignored_msrs; +extern bool ignore_msrs; + +extern u32 __read_mostly kvm_nr_uret_msrs; +/* Log an unhandled WRMSR and its value, but only if report_ignored_msrs is set. */ +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} +/* Log an unhandled RDMSR, but only if report_ignored_msrs is set. */ +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + +/* + * The first...last VMX feature MSRs that are emulated by KVM. This may or may + * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an + * associated feature that KVM supports for nested virtualization. + */ +#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC +#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC + +/* + * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves + * are arbitrary and have no meaning, the only requirement is that they don't + * conflict with "real" MSRs that KVM supports. 
Use values at the upper end + * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values + * will be usable until KVM exhausts its supply of paravirtual MSR indices. + */ +#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff + +#define MSR_IA32_CR_PAT_DEFAULT \ + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) + +void kvm_init_msr_lists(void); +int kvm_get_msr_index_list(struct kvm_msr_list __user *user_msr_list); +int kvm_get_feature_msr_index_list(struct kvm_msr_list __user *user_msr_list); +int kvm_get_feature_msrs(struct kvm_msrs __user *user_msrs); + +int kvm_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs); +int kvm_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs); + +int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp); +int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list); + +void kvm_enable_efer_bits(u64); +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); + +int kvm_add_user_return_msr(u32 msr); +int 
kvm_find_user_return_msr(u32 msr); +int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +u64 kvm_get_user_return_msr(unsigned int slot); +/* True if kvm_find_user_return_msr() finds @msr, i.e. returns a non-negative value. */ +static inline bool kvm_is_supported_user_return_msr(u32 msr) +{ + return kvm_find_user_return_msr(msr) >= 0; +} + +void kvm_user_return_msr_cpu_online(void); +void drop_user_return_notifiers(void); +void kvm_destroy_user_return_msrs(void); + +int kvm_emulator_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index, + u64 *pdata); +int kvm_emulator_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index, + u64 data); +int kvm_emulator_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); +/* MSR access type flags; MSR_TYPE_RW is the OR of the read and write flags. */ +enum kvm_msr_access { + MSR_TYPE_R = BIT(0), + MSR_TYPE_W = BIT(1), + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, +}; + +/* + * Internal error codes that are used to indicate that MSR emulation encountered + * an error that should result in #GP in the guest, unless userspace handles it. + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM + * as part of KVM's lightly documented internal KVM_RUN return codes. + * + * UNSUPPORTED - The MSR isn't supported, either because it is completely + * unknown to KVM, or because the MSR should not exist according + * to the vCPU model. + * + * FILTERED - Access to the MSR is denied by a userspace MSR filter. 
+ */ +#define KVM_MSR_RET_UNSUPPORTED 2 +#define KVM_MSR_RET_FILTERED 3 + +int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter); +void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter); + +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); + +u64 kvm_get_arch_capabilities(void); +int kvm_spec_ctrl_test_value(u64 value); +/* Bit groups of a CET control value, as checked by kvm_is_valid_u_s_cet() below. */ +#define CET_US_RESERVED_BITS GENMASK(9, 6) +#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) +#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) +#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) +/* True if @data is a legal CET control value (presumably U_CET/S_CET) for @vcpu's features. */ +static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) +{ + if (data & CET_US_RESERVED_BITS) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (data & CET_US_SHSTK_MASK_BITS)) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + (data & CET_US_IBT_MASK_BITS)) + return false; + if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) + return false; + /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. 
*/ + if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) + return false; + + return true; +} + +#endif diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 6f74e2b27c1e..c4ec024943bb 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -19,7 +19,7 @@ #include <asm/mtrr.h> #include "cpuid.h" -#include "x86.h" +#include "msrs.h" static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index dd1c57593f48..7f777049d328 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -43,6 +43,18 @@ module_param(enable_pmu, bool, 0444); bool __read_mostly enable_mediated_pmu; EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[] __counted_by(nevents); +}; + struct kvm_pmu_emulated_event_selectors { u64 INSTRUCTIONS_RETIRED; u64 BRANCH_INSTRUCTIONS_RETIRED; diff --git a/arch/x86/kvm/regs.c b/arch/x86/kvm/regs.c new file mode 100644 index 000000000000..bd8147798cc3 --- /dev/null +++ b/arch/x86/kvm/regs.c @@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm_host.h> + +#include "lapic.h" +#include "mmu.h" +#include "regs.h" +#include "x86.h" +/* Linear RIP: RIP itself in 64-bit mode, else CS.base + RIP truncated to 32 bits. */ +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) +{ + /* Can't read the RIP when guest state is protected, just return 0 */ + if (vcpu->arch.guest_state_protected) + return 0; + + if (is_64_bit_mode(vcpu)) + return kvm_rip_read(vcpu); + return (u32)(kvm_get_segment_base(vcpu, VCPU_SREG_CS) + + kvm_rip_read(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip); +/* True if the vCPU's current linear RIP equals @linear_rip. */ +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + return kvm_get_linear_rip(vcpu) == linear_rip; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip); +/* Read RFLAGS, hiding TF while KVM_GUESTDBG_SINGLESTEP is active. */ +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) +{ + unsigned long 
rflags; + + rflags = kvm_x86_call(get_rflags)(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + rflags &= ~X86_EFLAGS_TF; + return rflags; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags); +/* Write RFLAGS, forcing TF on while single-stepping at arch.singlestep_rip. */ +void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) + rflags |= X86_EFLAGS_TF; + kvm_x86_call(set_rflags)(vcpu, rflags); +} +/* Write RFLAGS via __kvm_set_rflags() and request KVM_REQ_EVENT. */ +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + __kvm_set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); +/* Fill @regs with the vCPU's general purpose registers, RIP and RFLAGS. */ +static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { + /* + * We are here if userspace calls get_regs() in the middle of + * instruction emulation. Register state needs to be copied + * back from emulation context to vcpu. Userspace shouldn't do + * that usually, but some badly designed PV devices (vmware + * backdoor interface) need this to work. + */ + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + } + regs->rax = kvm_rax_read_raw(vcpu); + regs->rbx = kvm_rbx_read_raw(vcpu); + regs->rcx = kvm_rcx_read_raw(vcpu); + regs->rdx = kvm_rdx_read_raw(vcpu); + regs->rsi = kvm_rsi_read_raw(vcpu); + regs->rdi = kvm_rdi_read_raw(vcpu); + regs->rsp = kvm_rsp_read(vcpu); + regs->rbp = kvm_rbp_read_raw(vcpu); +#ifdef CONFIG_X86_64 + regs->r8 = kvm_r8_read_raw(vcpu); + regs->r9 = kvm_r9_read_raw(vcpu); + regs->r10 = kvm_r10_read_raw(vcpu); + regs->r11 = kvm_r11_read_raw(vcpu); + regs->r12 = kvm_r12_read_raw(vcpu); + regs->r13 = kvm_r13_read_raw(vcpu); + regs->r14 = kvm_r14_read_raw(vcpu); + regs->r15 = kvm_r15_read_raw(vcpu); +#endif + + regs->rip = kvm_rip_read(vcpu); + regs->rflags = kvm_get_rflags(vcpu); +} +/* Copy the vCPU's registers into @regs; -EINVAL if the guest state is protected. */ +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + if 
(vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + vcpu_load(vcpu); + __get_regs(vcpu, regs); + vcpu_put(vcpu); + return 0; +} +/* Load GPRs, RIP and RFLAGS from @regs and drop any pending exception. */ +static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + + kvm_rax_write_raw(vcpu, regs->rax); + kvm_rbx_write_raw(vcpu, regs->rbx); + kvm_rcx_write_raw(vcpu, regs->rcx); + kvm_rdx_write_raw(vcpu, regs->rdx); + kvm_rsi_write_raw(vcpu, regs->rsi); + kvm_rdi_write_raw(vcpu, regs->rdi); + kvm_rsp_write(vcpu, regs->rsp); + kvm_rbp_write_raw(vcpu, regs->rbp); +#ifdef CONFIG_X86_64 + kvm_r8_write_raw(vcpu, regs->r8); + kvm_r9_write_raw(vcpu, regs->r9); + kvm_r10_write_raw(vcpu, regs->r10); + kvm_r11_write_raw(vcpu, regs->r11); + kvm_r12_write_raw(vcpu, regs->r12); + kvm_r13_write_raw(vcpu, regs->r13); + kvm_r14_write_raw(vcpu, regs->r14); + kvm_r15_write_raw(vcpu, regs->r15); +#endif + + kvm_rip_write(vcpu, regs->rip); + kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception_vmexit.pending = false; + /* NOTE(review): kvm_set_rflags() above already requests KVM_REQ_EVENT. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); +} +/* Load the vCPU's registers from @regs; -EINVAL if the guest state is protected. */ +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + vcpu_load(vcpu); + __set_regs(vcpu, regs); + vcpu_put(vcpu); + return 0; +} +/* Mask of bits that must be clear in a present PDPTE (see load_pdptrs()). */ +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); +} + +/* + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 
+ */ +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct kvm_pagewalk *w = &vcpu->arch.gva_walk; + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + gpa_t real_gpa; + int i; + int ret; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK | + PFERR_GUEST_PAGE_MASK, NULL, 0); + if (real_gpa == INVALID_GPA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + /* A present PDPTE with any reserved bit set fails the whole load. */ + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & PT_PRESENT_MASK) && + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { + return 0; + } + } + + /* + * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled. + * Shadow page roots need to be reconstructed instead. + */ + if (!tdp_enabled && memcmp(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs))) + kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.root_mmu, + KVM_MMU_ROOT_CURRENT); + /* Commit the new PDPTRs and request KVM_REQ_LOAD_MMU_PGD. */ + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + vcpu->arch.pdptrs_from_userspace = false; + + return 1; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs); +/* Generic CR0 checks (no bits above 31, NW needs CD, PG needs PE), then the vendor hook. */ +static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) + return false; +#endif + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return false; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return false; + + return kvm_x86_call(is_valid_cr0)(vcpu, cr0); +} +/* Propagate a completed CR0 change (old_cr0 -> cr0) to TLB, async #PF and MMU state. */ +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) +{ + /* + * CR0.WP is incorporated into the MMU role, but only for non-nested, + * indirect shadow MMUs. 
If paging is disabled, no updates are needed + * as there are no permission bits to emulate. If TDP is enabled, the + * MMU's metadata needs to be updated, e.g. so that emulating guest + * translations does the right thing, but there's no need to unload the + * root as CR0.WP doesn't affect SPTEs. + */ + if ((cr0 ^ old_cr0) == X86_CR0_WP) { + if (!(cr0 & X86_CR0_PG)) + return; + + if (tdp_enabled) { + kvm_init_mmu(vcpu); + return; + } + } + + if ((cr0 ^ old_cr0) & X86_CR0_PG) { + /* + * Clearing CR0.PG is defined to flush the TLB from the guest's + * perspective. + */ + if (!(cr0 & X86_CR0_PG)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* + * Check for async #PF completion events when enabling paging, + * as the vCPU may have previously encountered async #PFs (it's + * entirely legal for the guest to toggle paging on/off without + * waiting for the async #PF queue to drain). + */ + else if (kvm_pv_async_pf_enabled(vcpu)) + kvm_make_request(KVM_REQ_APF_READY, vcpu); + } + /* A change to any MMU-role CR0 bit requires resetting the MMU context. */ + if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) + kvm_mmu_reset_context(vcpu); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0); +/* Validate and apply a new CR0 value. Returns 0 on success, 1 if it is rejected. */ +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + unsigned long old_cr0 = kvm_read_cr0(vcpu); + + if (!kvm_is_valid_cr0(vcpu, cr0)) + return 1; + /* CR0.ET is always forced to 1. */ + cr0 |= X86_CR0_ET; + + /* Write to CR0 reserved bits are ignored, even on Intel. 
*/ + cr0 &= ~CR0_RESERVED_BITS; + +#ifdef CONFIG_X86_64 + if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && + (cr0 & X86_CR0_PG)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) + return 1; + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + if (cs_l) + return 1; + } +#endif + if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && + is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && + !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) + return 1; + /* Paging cannot be disabled in 64-bit mode or while CR4.PCIDE is set. */ + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) + return 1; + /* CR0.WP cannot be cleared while CR4.CET is set. */ + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) + return 1; + + kvm_x86_call(set_cr0)(vcpu, cr0); + + kvm_post_set_cr0(vcpu, old_cr0, cr0); + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0); +/* LMSW: load CR0[3:1] from @msw; CR0.PE (bit 0) can be set but never cleared. */ +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); +/* Validate and apply a new CR3 value. Returns 0 on success, 1 if it is rejected. */ +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + bool skip_tlb_flush = false; + unsigned long pcid = 0; +#ifdef CONFIG_X86_64 + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; + cr3 &= ~X86_CR3_PCID_NOFLUSH; + pcid = cr3 & X86_CR3_PCID_MASK; + } +#endif + + /* PDPTRs are always reloaded for PAE paging. */ + if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) + goto handle_tlb_flush; + + /* + * Do not condition the GPA check on long mode, this helper is used to + * stuff CR3, e.g. for RSM emulation, and there is no guarantee that + * the current vCPU mode is accurate. + */ + if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) + return 1; + + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; + /* Switch the MMU to the new root only if CR3 actually changed. */ + if (cr3 != kvm_read_cr3(vcpu)) + kvm_mmu_new_pgd(vcpu, cr3); + + vcpu->arch.cr3 = cr3; + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. 
*/ + +handle_tlb_flush: + /* + * A load of CR3 that flushes the TLB flushes only the current PCID, + * even if PCID is disabled, in which case PCID=0 is flushed. It's a + * moot point in the end because _disabling_ PCID will flush all PCIDs, + * and it's impossible to use a non-zero PCID when PCID is disabled, + * i.e. only PCID=0 can be relevant. + */ + if (!skip_tlb_flush) + kvm_invalidate_pcid(vcpu, pcid); + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3); +/* A CR4 value is valid if it passes both __kvm_is_valid_cr4() and the vendor hook. */ +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + kvm_x86_call(is_valid_cr4)(vcpu, cr4); +} +/* Propagate a completed CR4 change (old_cr4 -> cr4) to the MMU and the TLB. */ +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) +{ + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) + kvm_mmu_reset_context(vcpu); + + /* + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST + * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed, + * so fall through. + */ + if (!tdp_enabled && + (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) + kvm_mmu_unload(vcpu); + + /* + * The TLB has to be flushed for all PCIDs if any of the following + * (architecturally required) changes happen: + * - CR4.PCIDE is changed from 1 to 0 + * - CR4.PGE is toggled + * + * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT. 
+ */ + if (((cr4 ^ old_cr4) & X86_CR4_PGE) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + + /* + * The TLB has to be flushed for the current PCID if any of the + * following (architecturally required) changes happen: + * - CR4.SMEP is changed from 0 to 1 + * - CR4.PAE is toggled + */ + else if (((cr4 ^ old_cr4) & X86_CR4_PAE) || + ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP))) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4); +/* Validate and apply a new CR4 value. Returns 0 on success, 1 if it is rejected. */ +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + + if (!kvm_is_valid_cr4(vcpu, cr4)) + return 1; + /* Long mode requires PAE and a fixed LA57; PAE paging must reload the PDPTRs. */ + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) + return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; + } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) + && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) + && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) + return 1; + + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; + } + /* CR4.CET cannot be set while CR0.WP is clear. */ + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + + kvm_x86_call(set_cr4)(vcpu, cr4); + + kvm_post_set_cr4(vcpu, old_cr4, cr4); + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4); +/* Set CR8 (1 if reserved bits are set): goes to the in-kernel APIC's TPR, else is cached. */ +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) + return 1; + if (lapic_in_kernel(vcpu)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->arch.cr8 = cr8; + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8); +/* Read CR8 from the in-kernel local APIC if there is one, else from the cached value. */ +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) +{ + if (lapic_in_kernel(vcpu)) + return kvm_lapic_get_cr8(vcpu); + else + return vcpu->arch.cr8; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); +/* Fill the segment, descriptor-table and control-register fields of @sregs. */ +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct desc_ptr 
dt; + /* With protected guest state, only CR0, CR4, CR8, EFER and the APIC base are reported. */ + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + + kvm_handle_exception_payload_quirk(vcpu); + + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_call(get_idt)(vcpu, &dt); + sregs->idt.limit = dt.size; + sregs->idt.base = dt.address; + kvm_x86_call(get_gdt)(vcpu, &dt); + sregs->gdt.limit = dt.size; + sregs->gdt.base = dt.address; + + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = kvm_read_cr3(vcpu); + +skip_protected_regs: + sregs->cr0 = kvm_read_cr0(vcpu); + sregs->cr4 = kvm_read_cr4(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); + sregs->efer = vcpu->arch.efer; + sregs->apic_base = vcpu->arch.apic_base; +} +/* Fill @sregs, including the injected non-soft interrupt (if any) in interrupt_bitmap. */ +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs_common(vcpu, sregs); + + if (vcpu->arch.guest_state_protected) + return; + + if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) + set_bit(vcpu->arch.interrupt.nr, + (unsigned long *)sregs->interrupt_bitmap); +} +/* Copy the vCPU's special registers into @sregs; -EINVAL if the guest state is protected. */ +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + vcpu_load(vcpu); + __get_sregs(vcpu, sregs); + vcpu_put(vcpu); + return 0; +} +/* Fill @sregs2; the PDPTRs are included (and flagged valid) only under PAE paging. */ +void kvm_vcpu_ioctl_x86_get_sregs2(struct kvm_vcpu *vcpu, + struct kvm_sregs2 *sregs2) +{ + int i; + + __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); + + if (vcpu->arch.guest_state_protected) + return; + + if (is_pae_paging(vcpu)) { + kvm_vcpu_srcu_read_lock(vcpu); + for (i = 0 ; i < 4 ; i++) + sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); + sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + 
kvm_vcpu_srcu_read_unlock(vcpu); + } +} + +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { + /* + * When EFER.LME and CR0.PG are set, the processor is in + * 64-bit mode (though maybe in a 32-bit code segment). + * CR4.PAE and EFER.LMA must be set. + */ + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; + if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3)) + return false; + } else { + /* + * Not in 64-bit mode: EFER.LMA is clear and the code + * segment cannot be 64-bit. + */ + if (sregs->efer & EFER_LMA || sregs->cs.l) + return false; + } + + return kvm_is_valid_cr4(vcpu, sregs->cr4) && + kvm_is_valid_cr0(vcpu, sregs->cr0); +} + +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, + int *mmu_reset_needed, bool update_pdptrs) +{ + int idx; + struct desc_ptr dt; + + if (!kvm_is_valid_sregs(vcpu, sregs)) + return -EINVAL; + + if (kvm_apic_set_base(vcpu, sregs->apic_base, true)) + return -EINVAL; + + if (vcpu->arch.guest_state_protected) + return 0; + + dt.size = sregs->idt.limit; + dt.address = sregs->idt.base; + kvm_x86_call(set_idt)(vcpu, &dt); + dt.size = sregs->gdt.limit; + dt.address = sregs->gdt.base; + kvm_x86_call(set_gdt)(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); + kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); + + *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + kvm_x86_call(set_efer)(vcpu, sregs->efer); + + *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + kvm_x86_call(set_cr0)(vcpu, sregs->cr0); + + *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + kvm_x86_call(set_cr4)(vcpu, sregs->cr4); + + if (update_pdptrs) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (is_pae_paging(vcpu)) { + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); + *mmu_reset_needed = 1; + } + 
srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_set_cr8(vcpu, sregs->cr8); + + /* Older userspace won't unhalt the vcpu on reset. */ + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !is_protmode(vcpu)) + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); + + return 0; +} + +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + int pending_vec, max_bits; + int mmu_reset_needed = 0; + int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); + + if (ret) + return ret; + + if (mmu_reset_needed) { + kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } + + max_bits = KVM_NR_INTERRUPTS; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int ret; + + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + vcpu_load(vcpu); + ret = __set_sregs(vcpu, sregs); + vcpu_put(vcpu); + return ret; +} + +int kvm_vcpu_ioctl_x86_set_sregs2(struct kvm_vcpu *vcpu, + struct kvm_sregs2 *sregs2) +{ + int mmu_reset_needed = 0; + bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; + bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && + 
!(sregs2->efer & EFER_LMA); + int i, ret; + + if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) + return -EINVAL; + + if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) + return -EINVAL; + + ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, + &mmu_reset_needed, !valid_pdptrs); + if (ret) + return ret; + + if (valid_pdptrs) { + for (i = 0; i < 4 ; i++) + kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); + + kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); + mmu_reset_needed = 1; + vcpu->arch.pdptrs_from_userspace = true; + } + if (mmu_reset_needed) { + kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } + return 0; +} + +void kvm_run_sync_regs_to_user(struct kvm_vcpu *vcpu) +{ + BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES); + + if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS) + __get_regs(vcpu, &vcpu->run->s.regs.regs); + + if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS) + __get_sregs(vcpu, &vcpu->run->s.regs.sregs); +} + +int kvm_run_sync_regs_from_user(struct kvm_vcpu *vcpu) +{ + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { + __set_regs(vcpu, &vcpu->run->s.regs.regs); + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; + } + + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { + struct kvm_sregs sregs = vcpu->run->s.regs.sregs; + + if (__set_sregs(vcpu, &sregs)) + return -EINVAL; + + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; + } + + return 0; +} + +void kvm_update_dr0123(struct kvm_vcpu *vcpu) +{ + int i; + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } +} + +void kvm_update_dr7(struct kvm_vcpu *vcpu) +{ + unsigned long dr7; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + dr7 = vcpu->arch.guest_debug_dr7; + else + dr7 = vcpu->arch.dr7; + kvm_x86_call(set_dr7)(vcpu, dr7); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + 
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); + +static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) +{ + u64 fixed = DR6_FIXED_1; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) + fixed |= DR6_RTM; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + fixed |= DR6_BUS_LOCK; + return fixed; +} + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + size_t size = ARRAY_SIZE(vcpu->arch.db); + + switch (dr) { + case 0 ... 3: + vcpu->arch.db[array_index_nospec(dr, size)] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4: + case 6: + if (!kvm_dr6_valid(val)) + return 1; /* #GP */ + vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); + break; + case 5: + default: /* 7 */ + if (!kvm_dr7_valid(val)) + return 1; /* #GP */ + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + kvm_update_dr7(vcpu); + break; + } + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr); + +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) +{ + size_t size = ARRAY_SIZE(vcpu->arch.db); + + switch (dr) { + case 0 ... 
3: + return vcpu->arch.db[array_index_nospec(dr, size)]; + case 4: + case 6: + return vcpu->arch.dr6; + case 5: + default: /* 7 */ + return vcpu->arch.dr7; + } +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); + +int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + unsigned int i; + + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + kvm_handle_exception_payload_quirk(vcpu); + + memset(dbgregs, 0, sizeof(*dbgregs)); + + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + dbgregs->db[i] = vcpu->arch.db[i]; + + dbgregs->dr6 = vcpu->arch.dr6; + dbgregs->dr7 = vcpu->arch.dr7; + return 0; +} + +int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + unsigned int i; + + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + if (dbgregs->flags) + return -EINVAL; + + if (!kvm_dr6_valid(dbgregs->dr6)) + return -EINVAL; + if (!kvm_dr7_valid(dbgregs->dr7)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + vcpu->arch.db[i] = dbgregs->db[i]; + + kvm_update_dr0123(vcpu); + vcpu->arch.dr6 = dbgregs->dr6; + vcpu->arch.dr7 = dbgregs->dr7; + kvm_update_dr7(vcpu); + + return 0; +} diff --git a/arch/x86/kvm/regs.h b/arch/x86/kvm/regs.h index 5bda738afb7c..94fd86728fed 100644 --- a/arch/x86/kvm/regs.h +++ b/arch/x86/kvm/regs.h @@ -16,6 +16,18 @@ static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS)); +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long 
cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); + static inline bool is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -397,6 +409,14 @@ static inline bool kvm_dr6_valid(u64 data) return !(data >> 32); } +static inline unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + return vcpu->arch.guest_debug_dr7; + + return vcpu->arch.dr7; +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; @@ -420,4 +440,44 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } +static inline unsigned long kvm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_call(get_segment_base)(vcpu, seg); +} + +static inline void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_call(set_segment)(vcpu, var, seg); +} + +static inline void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_call(get_segment)(vcpu, var, seg); +} + +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); + +unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); +void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); + +void kvm_vcpu_ioctl_x86_get_sregs2(struct kvm_vcpu *vcpu, + struct kvm_sregs2 *sregs2); +int kvm_vcpu_ioctl_x86_set_sregs2(struct kvm_vcpu *vcpu, + struct kvm_sregs2 *sregs2); + +void kvm_run_sync_regs_to_user(struct kvm_vcpu *vcpu); +int kvm_run_sync_regs_from_user(struct kvm_vcpu *vcpu); + +void kvm_update_dr0123(struct kvm_vcpu *vcpu); +void 
kvm_update_dr7(struct kvm_vcpu *vcpu); +int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs); +int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs); + + #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3e6c671a8dc2..ba985a02208a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -23,6 +23,7 @@ #include "kvm_emulate.h" #include "trace.h" +#include "irq.h" #include "mmu.h" #include "x86.h" #include "smm.h" @@ -112,16 +113,15 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3, svm->nested.ctl.misc_ctl); - vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; - vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; - vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + vcpu->arch.ngpa_walk.get_guest_pgd = nested_svm_get_tdp_cr3; + vcpu->arch.ngpa_walk.get_pdptr = nested_svm_get_tdp_pdptr; + vcpu->arch.ngpa_walk.inject_page_fault = nested_svm_inject_npf_exit; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) { vcpu->arch.mmu = &vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) @@ -2150,7 +2150,7 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 pte_access) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk; if (WARN_ON_ONCE(!mmu_is_nested(vcpu))) return gpa; @@ -2159,7 +2159,7 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET)) access |= PFERR_USER_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); + return w->gva_to_gpa(vcpu, w, gpa, access, exception); } struct kvm_x86_nested_ops svm_nested_ops = { diff 
--git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4d2bacd00ec4..ba4ac1d860fd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4055,12 +4055,12 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); - if (vcpu->arch.nested_run_pending) - return -EBUSY; - if (svm_interrupt_blocked(vcpu)) return 0; + if (vcpu->arch.nested_run_pending) + return -EBUSY; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h index 3f9150125e70..117bf8bec07d 100644 --- a/arch/x86/kvm/tss.h +++ b/arch/x86/kvm/tss.h @@ -57,4 +57,11 @@ struct tss_segment_16 { u16 ldt; }; +#define TSS_IOPB_BASE_OFFSET 0x66 +#define TSS_BASE_SIZE 0x68 +#define TSS_IOPB_SIZE (65536 / 8) +#define TSS_REDIRECTION_SIZE (256 / 8) +#define RMODE_TSS_SIZE \ + (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) + #endif diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6957bb6f5cf7..0635e92471c8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -11,6 +11,7 @@ #include "x86.h" #include "cpuid.h" #include "hyperv.h" +#include "irq.h" #include "mmu.h" #include "nested.h" #include "pmu.h" @@ -407,7 +408,7 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) - kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); + kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.ngpa_walk, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, @@ -511,17 +512,15 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.guest_mmu; nested_ept_new_eptp(vcpu); - vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; - vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; - vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; + 
vcpu->arch.ngpa_walk.get_guest_pgd = nested_ept_get_eptp; + vcpu->arch.ngpa_walk.get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + vcpu->arch.ngpa_walk.inject_page_fault = nested_ept_inject_page_fault; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) { vcpu->arch.mmu = &vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -7463,12 +7462,13 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) return 0; } + static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception, u64 pte_access) { - struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk; if (WARN_ON_ONCE(!mmu_is_nested(vcpu))) return gpa; @@ -7481,7 +7481,7 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK)) access |= PFERR_USER_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); + return w->gva_to_gpa(vcpu, w, gpa, access, exception); } struct kvm_x86_nested_ops vmx_nested_ops = { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cc75feec05da..a03add00f923 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -72,6 +72,7 @@ #include "x86.h" #include "x86_ops.h" #include "smm.h" +#include "tss.h" #include "vmx_onhyperv.h" #include "vmenter.h" #include "posted_intr.h" @@ -1186,6 +1187,18 @@ static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); } +static u16 vmx_store_ldt(void) +{ + u16 ldt; + asm("sldt %0" : "=g"(ldt)); + return ldt; +} + +static void vmx_load_ldt(u16 sel) +{ + asm("lldt %0" : : "rm"(sel)); +} + #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -1203,7 +1216,7 @@ static unsigned long 
segment_base(u16 selector) table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u16 ldt_selector = kvm_read_ldt(); + u16 ldt_selector = vmx_store_ldt(); if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; @@ -1358,7 +1371,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - host_state->ldt_sel = kvm_read_ldt(); + host_state->ldt_sel = vmx_store_ldt(); #ifdef CONFIG_X86_64 savesegment(ds, host_state->ds_sel); @@ -1405,7 +1418,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { - kvm_load_ldt(host_state->ldt_sel); + vmx_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(host_state->gs_sel); #else @@ -5238,6 +5251,9 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { + if (vmx_interrupt_blocked(vcpu)) + return 0; + if (vcpu->arch.nested_run_pending) return -EBUSY; @@ -5248,7 +5264,7 @@ int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return -EBUSY; - return !vmx_interrupt_blocked(vcpu); + return 1; } int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -8703,7 +8719,7 @@ __init int vmx_hardware_setup(void) /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID - * bits to shadow_zero_check. + * bits into the MMU's struct kvm_page_format. 
*/ vmx_setup_me_spte_mask(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index afcac1042947..0626e835e9eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -80,7 +80,6 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> -#include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <asm/sgx.h> #include <asm/virt.h> @@ -90,8 +89,6 @@ #define CREATE_TRACE_POINTS #include "trace.h" -#define MAX_IO_MSRS 256 - /* * Note, kvm_caps fields should *never* have default values, all fields must be * recomputed from scratch during vendor module load, e.g. to account for a @@ -108,17 +105,12 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host); #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) -/* EFER defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. 
*/ -#ifdef CONFIG_X86_64 -static -u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); -#else -static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); -#endif - +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE @@ -128,17 +120,38 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \ KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) +#define KVM_CLOCK_VALID_FLAGS \ + (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) + +#define KVM_X86_VALID_QUIRKS \ + (KVM_X86_QUIRK_LINT0_REENABLED | \ + KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ + KVM_X86_QUIRK_OUT_7E_INC_RIP | \ + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL | \ + KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT | \ + KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \ + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT) + +#define KVM_X86_CONDITIONAL_QUIRKS \ + (KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) + +#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ + KVM_BUS_LOCK_DETECTION_EXIT) + +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + static void process_nmi(struct kvm_vcpu *vcpu); -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); -static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); - static DEFINE_MUTEX(vendor_module_lock); -static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); struct 
kvm_x86_ops kvm_x86_ops __read_mostly; @@ -152,13 +165,6 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl); -static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, 0644); - -bool __read_mostly report_ignored_msrs = true; -module_param(report_ignored_msrs, bool, 0644); -EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); - unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); @@ -181,34 +187,10 @@ module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, 0644); -bool __read_mostly eager_page_split = true; -module_param(eager_page_split, bool, 0644); - /* Enable/disable SMT_RSB bug mitigation */ static bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); -/* - * Restoring the host value for MSRs that are only consumed when running in - * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU - * returns to userspace, i.e. the kernel can run with the guest's value. 
- */ -#define KVM_MAX_NR_USER_RETURN_MSRS 16 - -struct kvm_user_return_msrs { - struct user_return_notifier urn; - bool registered; - struct kvm_user_return_msr_values { - u64 host; - u64 curr; - } values[KVM_MAX_NR_USER_RETURN_MSRS]; -}; - -u32 __read_mostly kvm_nr_uret_msrs; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); -static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; -static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); - #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ @@ -311,249 +293,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { static struct kmem_cache *x86_emulator_cache; -/* - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds - * MSRs that KVM emulates without strictly requiring host support. - * msr_based_features holds MSRs that enumerate features, i.e. are effectively - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with - * msrs_to_save and emulated_msrs. 
- */ - -static const u32 msrs_to_save_base[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, - MSR_IA32_UMWAIT_CONTROL, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, - - MSR_IA32_U_CET, MSR_IA32_S_CET, - MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, - MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, - MSR_IA32_DEBUGCTLMSR, - MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP, - MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, -}; - -static const u32 msrs_to_save_pmu[] = { - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, - - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. 
*/ - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, - - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, -}; - -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + - ARRAY_SIZE(msrs_to_save_pmu)]; -static unsigned num_msrs_to_save; - -static const u32 emulated_msrs_all[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - -#ifdef CONFIG_KVM_HYPERV - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_VP_ASSIST_PAGE, - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, - 
HV_X64_MSR_SYNDBG_OPTIONS, - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, - HV_X64_MSR_SYNDBG_PENDING_BUFFER, -#endif - - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSC_DEADLINE, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, - MSR_MISC_FEATURES_ENABLES, - MSR_AMD64_VIRT_SPEC_CTRL, - MSR_AMD64_TSC_RATIO, - MSR_IA32_POWER_CTL, - MSR_IA32_UCODE_REV, - - /* - * KVM always supports the "true" VMX control MSRs, even if the host - * does not. The VMX MSRs as a whole are considered "emulated" as KVM - * doesn't strictly require them to exist in the host (ignoring that - * KVM would refuse to load in the first place if the core set of MSRs - * aren't supported). - */ - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_K7_HWCR, - MSR_KVM_POLL_CONTROL, -}; - -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; -static unsigned num_emulated_msrs; - -/* - * List of MSRs that control the existence of MSR-based features, i.e. MSRs - * that are effectively CPUID leafs. VMX MSRs are also included in the set of - * feature MSRs, but are handled separately to allow expedited lookups. 
- */ -static const u32 msr_based_features_all_except_vmx[] = { - MSR_AMD64_DE_CFG, - MSR_IA32_UCODE_REV, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_PLATFORM_INFO, -}; - -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; -static unsigned int num_msr_based_features; - -/* - * All feature MSRs except uCode revID, which tracks the currently loaded uCode - * patch, are immutable once the vCPU model is defined. - */ -static bool kvm_is_immutable_feature_msr(u32 msr) -{ - int i; - - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) - return true; - - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { - if (msr == msr_based_features_all_except_vmx[i]) - return msr != MSR_IA32_UCODE_REV; - } - - return false; -} - -static bool kvm_is_advertised_msr(u32 msr_index) -{ - unsigned int i; - - for (i = 0; i < num_msrs_to_save; i++) { - if (msrs_to_save[i] == msr_index) - return true; - } - - for (i = 0; i < num_emulated_msrs; i++) { - if (emulated_msrs[i] == msr_index) - return true; - } - - return false; -} - -typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated); - -static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, - u64 *data, bool host_initiated, - enum kvm_msr_access rw, - msr_access_t msr_access_fn) -{ - const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; - int ret; - - BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); - - /* - * Zero the data on read failures to avoid leaking stack data to the - * guest and/or userspace, e.g. if the failure is ignored below. 
- */ - ret = msr_access_fn(vcpu, msr, data, host_initiated); - if (ret && rw == MSR_TYPE_R) - *data = 0; - - if (ret != KVM_MSR_RET_UNSUPPORTED) - return ret; - - /* - * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM - * advertises to userspace, even if an MSR isn't fully supported. - * Simply check that @data is '0', which covers both the write '0' case - * and all reads (in which case @data is zeroed on failure; see above). - */ - if (host_initiated && !*data && kvm_is_advertised_msr(msr)) - return 0; - - if (!ignore_msrs) { - kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", - op, msr, *data); - return ret; - } - - if (report_ignored_msrs) - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); - - return 0; -} - static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); @@ -567,128 +306,6 @@ static struct kmem_cache *kvm_alloc_emulator_cache(void) static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -static void kvm_destroy_user_return_msrs(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); - - kvm_nr_uret_msrs = 0; -} - -static void kvm_on_user_return(struct user_return_notifier *urn) -{ - unsigned slot; - struct kvm_user_return_msrs *msrs - = container_of(urn, struct kvm_user_return_msrs, urn); - struct kvm_user_return_msr_values *values; - - msrs->registered = false; - user_return_notifier_unregister(urn); - - for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { - values = &msrs->values[slot]; - if (values->host != values->curr) { - wrmsrq(kvm_uret_msrs_list[slot], values->host); - values->curr = values->host; - } - } -} - -static int kvm_probe_user_return_msr(u32 msr) -{ - u64 val; - int ret; - - preempt_disable(); - ret = rdmsrq_safe(msr, &val); - if (ret) - goto out; - ret = wrmsrq_safe(msr, val); -out: - preempt_enable(); - return ret; -} - -int kvm_add_user_return_msr(u32 msr) -{ - 
BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); - - if (kvm_probe_user_return_msr(msr)) - return -1; - - kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; - return kvm_nr_uret_msrs++; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); - -int kvm_find_user_return_msr(u32 msr) -{ - int i; - - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - if (kvm_uret_msrs_list[i] == msr) - return i; - } - return -1; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); - -static void kvm_user_return_msr_cpu_online(void) -{ - struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); - u64 value; - int i; - - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrq_safe(kvm_uret_msrs_list[i], &value); - msrs->values[i].host = value; - msrs->values[i].curr = value; - } -} - -static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) -{ - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } -} - -int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) -{ - struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); - int err; - - value = (value & mask) | (msrs->values[slot].host & ~mask); - if (value == msrs->values[slot].curr) - return 0; - err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); - if (err) - return 1; - - msrs->values[slot].curr = value; - kvm_user_return_register_notifier(msrs); - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); - -u64 kvm_get_user_return_msr(unsigned int slot) -{ - return this_cpu_ptr(&user_return_msrs)->values[slot].curr; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); - -static void drop_user_return_notifiers(void) -{ - struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); - - if (msrs->registered) - kvm_on_user_return(&msrs->urn); -} - /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. 
* @@ -943,17 +560,6 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp); -static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) -{ - if (err) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | - EMULTYPE_COMPLETE_USER_EXIT); -} - void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, bool from_hardware) { @@ -976,11 +582,12 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, bool from_hardware) { - struct kvm_mmu *fault_mmu; + struct kvm_pagewalk *fault_walk; + WARN_ON_ONCE(fault->vector != PF_VECTOR); - fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : - vcpu->arch.walk_mmu; + fault_walk = fault->nested_page_fault ? &vcpu->arch.ngpa_walk : + &vcpu->arch.gva_walk; /* * Invalidate the TLB entry for the faulting address, if it exists, @@ -988,10 +595,10 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, */ if ((fault->error_code & PFERR_PRESENT_MASK) && !(fault->error_code & PFERR_RSVD_MASK)) - kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, + kvm_mmu_invalidate_addr(vcpu, fault_walk, fault->address, KVM_MMU_ROOT_CURRENT); - fault_mmu->inject_page_fault(vcpu, fault, from_hardware); + fault_walk->inject_page_fault(vcpu, fault, from_hardware); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_inject_emulated_page_fault); @@ -1017,170 +624,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); -static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); -} - -/* - * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 
- */ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - gpa_t real_gpa; - int i; - int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - - /* - * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated - * to an L1 GPA. - */ - real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), - PFERR_USER_MASK | PFERR_WRITE_MASK | - PFERR_GUEST_PAGE_MASK, NULL, 0); - if (real_gpa == INVALID_GPA) - return 0; - - /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, - cr3 & GENMASK(11, 5), sizeof(pdpte)); - if (ret < 0) - return 0; - - for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - return 0; - } - } - - /* - * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled. - * Shadow page roots need to be reconstructed instead. - */ - if (!tdp_enabled && memcmp(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs))) - kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT); - - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); - kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); - kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - vcpu->arch.pdptrs_from_userspace = false; - - return 1; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs); - -static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) - return false; -#endif - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return false; - - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return false; - - return kvm_x86_call(is_valid_cr0)(vcpu, cr0); -} - -void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) -{ - /* - * CR0.WP is incorporated into the MMU role, but only for non-nested, - * indirect shadow MMUs. 
If paging is disabled, no updates are needed - * as there are no permission bits to emulate. If TDP is enabled, the - * MMU's metadata needs to be updated, e.g. so that emulating guest - * translations does the right thing, but there's no need to unload the - * root as CR0.WP doesn't affect SPTEs. - */ - if ((cr0 ^ old_cr0) == X86_CR0_WP) { - if (!(cr0 & X86_CR0_PG)) - return; - - if (tdp_enabled) { - kvm_init_mmu(vcpu); - return; - } - } - - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - /* - * Clearing CR0.PG is defined to flush the TLB from the guest's - * perspective. - */ - if (!(cr0 & X86_CR0_PG)) - kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); - /* - * Check for async #PF completion events when enabling paging, - * as the vCPU may have previously encountered async #PFs (it's - * entirely legal for the guest to toggle paging on/off without - * waiting for the async #PF queue to drain). - */ - else if (kvm_pv_async_pf_enabled(vcpu)) - kvm_make_request(KVM_REQ_APF_READY, vcpu); - } - - if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) - kvm_mmu_reset_context(vcpu); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0); - -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - unsigned long old_cr0 = kvm_read_cr0(vcpu); - - if (!kvm_is_valid_cr0(vcpu, cr0)) - return 1; - - cr0 |= X86_CR0_ET; - - /* Write to CR0 reserved bits are ignored, even on Intel. 
*/ - cr0 &= ~CR0_RESERVED_BITS; - -#ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && - (cr0 & X86_CR0_PG)) { - int cs_db, cs_l; - - if (!is_pae(vcpu)) - return 1; - kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; - } -#endif - if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && - is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && - !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) - return 1; - - if (!(cr0 & X86_CR0_PG) && - (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) - return 1; - - if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) - return 1; - - kvm_x86_call(set_cr0)(vcpu, cr0); - - kvm_post_set_cr0(vcpu, old_cr0, cr0); - - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0); - -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) -{ - (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); - static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) { if (vcpu->arch.guest_state_protected) @@ -1224,13 +667,6 @@ static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) } } -#ifdef CONFIG_X86_64 -static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; -} -#endif - int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; @@ -1290,89 +726,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv); -static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - return __kvm_is_valid_cr4(vcpu, cr4) && - kvm_x86_call(is_valid_cr4)(vcpu, cr4); -} - -void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) -{ - if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) - kvm_mmu_reset_context(vcpu); - - /* - * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB - * according to the SDM; however, stale 
prev_roots could be reused - * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we - * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST - * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed, - * so fall through. - */ - if (!tdp_enabled && - (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) - kvm_mmu_unload(vcpu); - - /* - * The TLB has to be flushed for all PCIDs if any of the following - * (architecturally required) changes happen: - * - CR4.PCIDE is changed from 1 to 0 - * - CR4.PGE is toggled - * - * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT. - */ - if (((cr4 ^ old_cr4) & X86_CR4_PGE) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) - kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); - - /* - * The TLB has to be flushed for the current PCID if any of the - * following (architecturally required) changes happen: - * - CR4.SMEP is changed from 0 to 1 - * - CR4.PAE is toggled - */ - else if (((cr4 ^ old_cr4) & X86_CR4_PAE) || - ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP))) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4); - -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long old_cr4 = kvm_read_cr4(vcpu); - - if (!kvm_is_valid_cr4(vcpu, cr4)) - return 1; - - if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) - return 1; - if ((cr4 ^ old_cr4) & X86_CR4_LA57) - return 1; - } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) - && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) - return 1; - - if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { - /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) - return 1; - } - - if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) - return 1; - - kvm_x86_call(set_cr4)(vcpu, cr4); - - kvm_post_set_cr4(vcpu, old_cr4, cr4); - - return 
0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4); - -static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) +void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; unsigned long roots_to_free = 0; @@ -1415,167 +769,6 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); } -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - bool skip_tlb_flush = false; - unsigned long pcid = 0; -#ifdef CONFIG_X86_64 - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { - skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; - cr3 &= ~X86_CR3_PCID_NOFLUSH; - pcid = cr3 & X86_CR3_PCID_MASK; - } -#endif - - /* PDPTRs are always reloaded for PAE paging. */ - if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) - goto handle_tlb_flush; - - /* - * Do not condition the GPA check on long mode, this helper is used to - * stuff CR3, e.g. for RSM emulation, and there is no guarantee that - * the current vCPU mode is accurate. - */ - if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) - return 1; - - if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) - return 1; - - if (cr3 != kvm_read_cr3(vcpu)) - kvm_mmu_new_pgd(vcpu, cr3); - - vcpu->arch.cr3 = cr3; - kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); - /* Do not call post_set_cr3, we do not get here for confidential guests. */ - -handle_tlb_flush: - /* - * A load of CR3 that flushes the TLB flushes only the current PCID, - * even if PCID is disabled, in which case PCID=0 is flushed. It's a - * moot point in the end because _disabling_ PCID will flush all PCIDs, - * and it's impossible to use a non-zero PCID when PCID is disabled, - * i.e. only PCID=0 can be relevant. 
- */ - if (!skip_tlb_flush) - kvm_invalidate_pcid(vcpu, pcid); - - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3); - -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - if (cr8 & CR8_RESERVED_BITS) - return 1; - if (lapic_in_kernel(vcpu)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8); - -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) -{ - if (lapic_in_kernel(vcpu)) - return kvm_lapic_get_cr8(vcpu); - else - return vcpu->arch.cr8; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); - -static void kvm_update_dr0123(struct kvm_vcpu *vcpu) -{ - int i; - - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - } -} - -void kvm_update_dr7(struct kvm_vcpu *vcpu) -{ - unsigned long dr7; - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - dr7 = vcpu->arch.guest_debug_dr7; - else - dr7 = vcpu->arch.dr7; - kvm_x86_call(set_dr7)(vcpu, dr7); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; - if (dr7 & DR7_BP_EN_MASK) - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); - -static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) -{ - u64 fixed = DR6_FIXED_1; - - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) - fixed |= DR6_RTM; - - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) - fixed |= DR6_BUS_LOCK; - return fixed; -} - -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - size_t size = ARRAY_SIZE(vcpu->arch.db); - - switch (dr) { - case 0 ... 
3: - vcpu->arch.db[array_index_nospec(dr, size)] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = val; - break; - case 4: - case 6: - if (!kvm_dr6_valid(val)) - return 1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - break; - case 5: - default: /* 7 */ - if (!kvm_dr7_valid(val)) - return 1; /* #GP */ - vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - kvm_update_dr7(vcpu); - break; - } - - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr); - -unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) -{ - size_t size = ARRAY_SIZE(vcpu->arch.db); - - switch (dr) { - case 0 ... 3: - return vcpu->arch.db[array_index_nospec(dr, size)]; - case 4: - case 6: - return vcpu->arch.dr6; - case 5: - default: /* 7 */ - return vcpu->arch.dr7; - } -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); - -static unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - return vcpu->arch.guest_debug_dr7; - - return vcpu->arch.dr7; -} - int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 pmc = kvm_ecx_read(vcpu); @@ -1592,595 +785,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); -/* - * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM - * does not yet virtualize. 
These include: - * 10 - MISC_PACKAGE_CTRLS - * 11 - ENERGY_FILTERING_CTL - * 12 - DOITM - * 18 - FB_CLEAR_CTRL - * 21 - XAPIC_DISABLE_STATUS - * 23 - OVERCLOCKING_STATUS - */ - -#define KVM_SUPPORTED_ARCH_CAP \ - (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ - ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ - ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ - ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) - -static u64 kvm_get_arch_capabilities(void) -{ - u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; - - /* - * If nx_huge_pages is enabled, KVM's shadow paging will ensure that - * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other - * L1 guests, so it need not worry about its own (L2) guests. - */ - data |= ARCH_CAP_PSCHANGE_MC_NO; - - /* - * If we're doing cache flushes (either "always" or "cond") - * we will do one whenever the guest does a vmlaunch/vmresume. - * If an outer hypervisor is doing the cache flush for us - * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that - * capability to the guest too, and if EPT is disabled we're not - * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will - * require a nested hypervisor to do a flush of its own. 
- */ - if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) - data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; - - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) - data |= ARCH_CAP_RDCL_NO; - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - data |= ARCH_CAP_SSB_NO; - if (!boot_cpu_has_bug(X86_BUG_MDS)) - data |= ARCH_CAP_MDS_NO; - if (!boot_cpu_has_bug(X86_BUG_RFDS)) - data |= ARCH_CAP_RFDS_NO; - if (!boot_cpu_has_bug(X86_BUG_ITS)) - data |= ARCH_CAP_ITS_NO; - - if (!boot_cpu_has(X86_FEATURE_RTM)) { - /* - * If RTM=0 because the kernel has disabled TSX, the host might - * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 - * and therefore knows that there cannot be TAA) but keep - * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, - * and we want to allow migrating those guests to tsx=off hosts. - */ - data &= ~ARCH_CAP_TAA_NO; - } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { - data |= ARCH_CAP_TAA_NO; - } else { - /* - * Nothing to do here; we emulate TSX_CTRL if present on the - * host so the guest can choose between disabling TSX or - * using VERW to clear CPU buffers. 
- */ - } - - if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) - data |= ARCH_CAP_GDS_NO; - - return data; -} - -static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) -{ - WARN_ON_ONCE(!host_initiated); - - switch (index) { - case MSR_IA32_ARCH_CAPABILITIES: - *data = kvm_get_arch_capabilities(); - break; - case MSR_IA32_PERF_CAPABILITIES: - *data = kvm_caps.supported_perf_cap; - break; - case MSR_PLATFORM_INFO: - *data = MSR_PLATFORM_INFO_CPUID_FAULT; - break; - case MSR_IA32_UCODE_REV: - rdmsrq_safe(index, data); - break; - default: - return kvm_x86_call(get_feature_msr)(index, data); - } - return 0; -} - -static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, - kvm_get_feature_msr); -} - -static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) - return false; - - if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) - return false; - - if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) - return false; - - if (efer & (EFER_LME | EFER_LMA) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) - return false; - - if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) - return false; - - return true; - -} -bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - if (efer & efer_reserved_bits) - return false; - - return __kvm_valid_efer(vcpu, efer); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); - -static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - u64 old_efer = vcpu->arch.efer; - u64 efer = msr_info->data; - int r; - - if (efer & efer_reserved_bits) - return 1; - - if (!msr_info->host_initiated) { - if (!__kvm_valid_efer(vcpu, efer)) - return 1; - - if (is_paging(vcpu) && - (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; - } - - efer &= ~EFER_LMA; - efer 
|= vcpu->arch.efer & EFER_LMA; - - r = kvm_x86_call(set_efer)(vcpu, efer); - if (r) { - WARN_ON(r > 0); - return r; - } - - if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) - kvm_mmu_reset_context(vcpu); - - if (!static_cpu_has(X86_FEATURE_XSAVES) && - (efer & EFER_SVME)) - kvm_hv_xsaves_xsavec_maybe_warn(vcpu); - - return 0; -} - -void kvm_enable_efer_bits(u64 mask) -{ - efer_reserved_bits &= ~mask; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits); - -bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) -{ - struct kvm_x86_msr_filter *msr_filter; - struct msr_bitmap_range *ranges; - struct kvm *kvm = vcpu->kvm; - bool allowed; - int idx; - u32 i; - - /* x2APIC MSRs do not support filtering. */ - if (index >= 0x800 && index <= 0x8ff) - return true; - - idx = srcu_read_lock(&kvm->srcu); - - msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); - if (!msr_filter) { - allowed = true; - goto out; - } - - allowed = msr_filter->default_allow; - ranges = msr_filter->ranges; - - for (i = 0; i < msr_filter->count; i++) { - u32 start = ranges[i].base; - u32 end = start + ranges[i].nmsrs; - u32 flags = ranges[i].flags; - unsigned long *bitmap = ranges[i].bitmap; - - if ((index >= start) && (index < end) && (flags & type)) { - allowed = test_bit(index - start, bitmap); - break; - } - } - -out: - srcu_read_unlock(&kvm->srcu, idx); - - return allowed; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); - -/* - * Write @data into the MSR specified by @index. Select MSR specific fault - * checks are bypassed if @host_initiated is %true. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. 
- */ -static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, - bool host_initiated) -{ - struct msr_data msr; - - switch (index) { - case MSR_FS_BASE: - case MSR_GS_BASE: - case MSR_KERNEL_GS_BASE: - case MSR_CSTAR: - case MSR_LSTAR: - if (is_noncanonical_msr_address(data, vcpu)) - return 1; - break; - case MSR_IA32_SYSENTER_EIP: - case MSR_IA32_SYSENTER_ESP: - /* - * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if - * non-canonical address is written on Intel but not on - * AMD (which ignores the top 32-bits, because it does - * not implement 64-bit SYSENTER). - * - * 64-bit code should hence be able to write a non-canonical - * value on AMD. Making the address canonical ensures that - * vmentry does not fail on Intel after writing a non-canonical - * value, and that something deterministic happens if the guest - * invokes 64-bit SYSENTER. - */ - data = __canonical_address(data, max_host_virt_addr_bits()); - break; - case MSR_TSC_AUX: - if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) - return 1; - - if (!host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) - return 1; - - /* - * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has - * incomplete and conflicting architectural behavior. Current - * AMD CPUs completely ignore bits 63:32, i.e. they aren't - * reserved and always read as zeros. Enforce Intel's reserved - * bits check if the guest CPU is Intel compatible, otherwise - * clear the bits. This ensures cross-vendor migration will - * provide consistent behavior for the guest. 
- */ - if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0) - return 1; - - data = (u32)data; - break; - case MSR_IA32_U_CET: - case MSR_IA32_S_CET: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) - return KVM_MSR_RET_UNSUPPORTED; - if (!kvm_is_valid_u_s_cet(vcpu, data)) - return 1; - break; - case MSR_KVM_INTERNAL_GUEST_SSP: - if (!host_initiated) - return 1; - fallthrough; - /* - * Note that the MSR emulation here is flawed when a vCPU - * doesn't support the Intel 64 architecture. The expected - * architectural behavior in this case is that the upper 32 - * bits do not exist and should always read '0'. However, - * because the actual hardware on which the virtual CPU is - * running does support Intel 64, XRSTORS/XSAVES in the - * guest could observe behavior that violates the - * architecture. Intercepting XRSTORS/XSAVES for this - * special case isn't deemed worthwhile. - */ - case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) - return KVM_MSR_RET_UNSUPPORTED; - /* - * MSR_IA32_INT_SSP_TAB is not present on processors that do - * not support Intel 64 architecture. 
- */ - if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) - return KVM_MSR_RET_UNSUPPORTED; - if (is_noncanonical_msr_address(data, vcpu)) - return 1; - /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ - if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) - return 1; - break; - } - - msr.data = data; - msr.index = index; - msr.host_initiated = host_initiated; - - return kvm_x86_call(set_msr)(vcpu, &msr); -} - -static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) -{ - return __kvm_set_msr(vcpu, index, *data, host_initiated); -} - -static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, - u32 index, u64 data, bool host_initiated) -{ - return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, - _kvm_set_msr); -} - -/* - * Read the MSR specified by @index into @data. Select MSR specific fault - * checks are bypassed if @host_initiated is %true. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) -{ - struct msr_data msr; - int ret; - - switch (index) { - case MSR_TSC_AUX: - if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) - return 1; - - if (!host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) - return 1; - break; - case MSR_IA32_U_CET: - case MSR_IA32_S_CET: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) - return KVM_MSR_RET_UNSUPPORTED; - break; - case MSR_KVM_INTERNAL_GUEST_SSP: - if (!host_initiated) - return 1; - fallthrough; - case MSR_IA32_PL0_SSP ... 
MSR_IA32_INT_SSP_TAB: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) - return KVM_MSR_RET_UNSUPPORTED; - break; - } - - msr.index = index; - msr.host_initiated = host_initiated; - - ret = kvm_x86_call(get_msr)(vcpu, &msr); - if (!ret) - *data = msr.data; - return ret; -} - -int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) -{ - return __kvm_set_msr(vcpu, index, data, true); -} - -int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - return __kvm_get_msr(vcpu, index, data, true); -} - -static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, - u32 index, u64 *data, bool host_initiated) -{ - return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, - __kvm_get_msr); -} - -int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - return kvm_get_msr_ignored_check(vcpu, index, data, false); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read); - -int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) -{ - return kvm_set_msr_ignored_check(vcpu, index, data, false); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write); - -int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; - - return __kvm_emulate_msr_read(vcpu, index, data); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read); - -int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) -{ - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; - - return __kvm_emulate_msr_write(vcpu, index, data); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write); - - -static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) -{ - if (!vcpu->run->msr.error) { - kvm_eax_write(vcpu, vcpu->run->msr.data); - kvm_edx_write(vcpu, vcpu->run->msr.data >> 32); - } -} - -static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) -{ - return complete_emulated_insn_gp(vcpu, 
vcpu->run->msr.error); -} - -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) -{ - complete_userspace_rdmsr(vcpu); - return complete_emulated_msr_access(vcpu); -} - -static int complete_fast_msr_access(struct kvm_vcpu *vcpu) -{ - return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error); -} - -static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) -{ - complete_userspace_rdmsr(vcpu); - return complete_fast_msr_access(vcpu); -} - -static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) -{ - if (!vcpu->run->msr.error) - kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, - vcpu->run->msr.data); - - return complete_fast_msr_access(vcpu); -} - -static u64 kvm_msr_reason(int r) -{ - switch (r) { - case KVM_MSR_RET_UNSUPPORTED: - return KVM_MSR_EXIT_REASON_UNKNOWN; - case KVM_MSR_RET_FILTERED: - return KVM_MSR_EXIT_REASON_FILTER; - default: - return KVM_MSR_EXIT_REASON_INVAL; - } -} - -static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, - u32 exit_reason, u64 data, - int (*completion)(struct kvm_vcpu *vcpu), - int r) -{ - u64 msr_reason = kvm_msr_reason(r); - - /* Check if the user wanted to know about this MSR fault */ - if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) - return 0; - - vcpu->run->exit_reason = exit_reason; - vcpu->run->msr.error = 0; - memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); - vcpu->run->msr.reason = msr_reason; - vcpu->run->msr.index = index; - vcpu->run->msr.data = data; - vcpu->arch.complete_userspace_io = completion; - - return 1; -} - -static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, - int (*complete_rdmsr)(struct kvm_vcpu *)) -{ - u64 data; - int r; - - r = kvm_emulate_msr_read(vcpu, msr, &data); - - if (!r) { - trace_kvm_msr_read(msr, data); - - if (reg < 0) { - kvm_eax_write(vcpu, data); - kvm_edx_write(vcpu, data >> 32); - } else { - kvm_register_write(vcpu, reg, data); - } - } else { - /* MSR read failed? 
See if we should ask user space */ - if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, - complete_rdmsr, r)) - return 0; - trace_kvm_msr_read_ex(msr); - } - - return kvm_x86_call(complete_emulated_msr)(vcpu, r); -} - -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) -{ - return __kvm_emulate_rdmsr(vcpu, kvm_ecx_read(vcpu), -1, - complete_fast_rdmsr); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr); - -int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) -{ - vcpu->arch.cui_rdmsr_imm_reg = reg; - - return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm); - -static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - int r; - - r = kvm_emulate_msr_write(vcpu, msr, data); - if (!r) { - trace_kvm_msr_write(msr, data); - } else { - /* MSR write failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, - complete_fast_msr_access, r)) - return 0; - /* Signal all other negative errors to userspace */ - if (r < 0) - return r; - trace_kvm_msr_write_ex(msr, data); - } - - return kvm_x86_call(complete_emulated_msr)(vcpu, r); -} - -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) -{ - return __kvm_emulate_wrmsr(vcpu, kvm_ecx_read(vcpu), - kvm_read_edx_eax(vcpu)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); - -int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) -{ - return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); - int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); @@ -2252,72 +856,6 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) - return EXIT_FASTPATH_NONE; - 
- switch (msr) { - case APIC_BASE_MSR + (APIC_ICR >> 4): - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || - kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) - return EXIT_FASTPATH_NONE; - break; - case MSR_IA32_TSC_DEADLINE: - kvm_set_lapic_tscdeadline_msr(vcpu, data); - break; - default: - return EXIT_FASTPATH_NONE; - } - - trace_kvm_msr_write(msr, data); - - if (!kvm_skip_emulated_instruction(vcpu)) - return EXIT_FASTPATH_EXIT_USERSPACE; - - return EXIT_FASTPATH_REENTER_GUEST; -} - -fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) -{ - return __handle_fastpath_wrmsr(vcpu, kvm_ecx_read(vcpu), - kvm_read_edx_eax(vcpu)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); - -fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) -{ - return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); - -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_get_msr_ignored_check(vcpu, index, data, true); -} - -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - u64 val; - - /* - * Reject writes to immutable feature MSRs if the vCPU model is frozen, - * as KVM doesn't support modifying the guest vCPU model on the fly, - * e.g. changing the VMX capabilities MSRs while L2 is active is - * nonsensical. Allow writes of the same value, e.g. so that userspace - * can blindly stuff all MSRs when emulating RESET. 
- */ - if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && - kvm_is_immutable_feature_msr(index) && - (do_get_msr(vcpu, index, &val) || *data != val)) - return -EINVAL; - - return kvm_set_msr_ignored_check(vcpu, index, *data, true); -} - #ifdef CONFIG_X86_64 struct pvclock_clock { int vclock_mode; @@ -2384,72 +922,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) -{ - int version; - int r; - struct pvclock_wall_clock wc; - u32 wc_sec_hi; - u64 wall_nsec; - - if (!wall_clock) - return; - - r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); - if (r) - return; - - if (version & 1) - ++version; /* first time write, random junk */ - - ++version; - - if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) - return; - - wall_nsec = kvm_get_wall_clock_epoch(kvm); - - wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); - wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); - - if (sec_hi_ofs) { - wc_sec_hi = wall_nsec >> 32; - kvm_write_guest(kvm, wall_clock + sec_hi_ofs, - &wc_sec_hi, sizeof(wc_sec_hi)); - } - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); -} - -static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, - bool old_msr, bool host_initiated) -{ - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !host_initiated) { - if (ka->boot_vcpu_runs_old_kvmclock != old_msr) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = old_msr; - } - - vcpu->arch.time = system_time; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... 
*/ - if (system_time & 1) - kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info)); - else - kvm_gpc_deactivate(&vcpu->arch.pv_time); - - return; -} - static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -2642,7 +1114,7 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio) return _tsc; } -static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; @@ -2683,7 +1155,7 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) +void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { if (vcpu->arch.guest_tsc_protected) return; @@ -2797,7 +1269,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, kvm_track_tsc_matching(vcpu, !matched); } -static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) +void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) { u64 data = user_value ? 
*user_value : 0; struct kvm *kvm = vcpu->kvm; @@ -2865,22 +1337,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - u64 tsc_offset = vcpu->arch.l1_tsc_offset; - kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) - WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc((u64) adjustment, - vcpu->arch.l1_tsc_scaling_ratio); - adjust_tsc_offset_guest(vcpu, adjustment); -} - #ifdef CONFIG_X86_64 static u64 read_tsc(void) @@ -3510,151 +1966,6 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } -/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ -static bool is_mci_control_msr(u32 msr) -{ - return (msr & 3) == 0; -} -static bool is_mci_status_msr(u32 msr) -{ - return (msr & 3) == 1; -} - -/* - * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. - */ -static bool can_set_mci_status(struct kvm_vcpu *vcpu) -{ - /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_compatible(vcpu)) - return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); - - return false; -} - -static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - u32 msr = msr_info->index; - u64 data = msr_info->data; - u32 offset, last_msr; - - switch (msr) { - case MSR_IA32_MCG_STATUS: - vcpu->arch.mcg_status = data; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P) && - (data || !msr_info->host_initiated)) - return 1; - if (data != 0 && data != ~(u64)0) - return 1; - vcpu->arch.mcg_ctl = data; - break; - case MSR_IA32_MC0_CTL2 ... 
MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: - last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; - if (msr > last_msr) - return 1; - - if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) - return 1; - /* An attempt to write a 1 to a reserved bit raises #GP */ - if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) - return 1; - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, - last_msr + 1 - MSR_IA32_MC0_CTL2); - vcpu->arch.mci_ctl2_banks[offset] = data; - break; - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; - if (msr > last_msr) - return 1; - - /* - * Only 0 or all 1s can be written to IA32_MCi_CTL, all other - * values are architecturally undefined. But, some Linux - * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB - * issue on AMD K8s, allow bit 10 to be clear when setting all - * other bits in order to avoid an uncaught #GP in the guest. - * - * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, - * single-bit ECC data errors. - */ - if (is_mci_control_msr(msr) && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return 1; - - /* - * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. - * AMD-based CPUs allow non-zero values, but if and only if - * HWCR[McStatusWrEn] is set. 
- */ - if (!msr_info->host_initiated && is_mci_status_msr(msr) && - data != 0 && !can_set_mci_status(vcpu)) - return 1; - - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, - last_msr + 1 - MSR_IA32_MC0_CTL); - vcpu->arch.mce_banks[offset] = data; - break; - default: - return 1; - } - return 0; -} - -static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) -{ - gpa_t gpa = data & ~0x3f; - - /* Bits 4:5 are reserved, Should be zero */ - if (data & 0x30) - return 1; - - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && - (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) - return 1; - - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && - (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) - return 1; - - if (!lapic_in_kernel(vcpu)) - return data ? 1 : 0; - - if (__kvm_pv_async_pf_enabled(data) && - kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u64))) - return 1; - - vcpu->arch.apf.msr_en_val = data; - - if (__kvm_pv_async_pf_enabled(data)) { - kvm_async_pf_wakeup_all(vcpu); - } else { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - return 0; -} - -static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) -{ - /* Bits 8-63 are reserved */ - if (data >> 8) - return 1; - - if (!lapic_in_kernel(vcpu)) - return 1; - - vcpu->arch.apf.msr_int_val = data; - - vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; - - return 0; -} - static void kvmclock_reset(struct kvm_vcpu *vcpu) { kvm_gpc_deactivate(&vcpu->arch.pv_time); @@ -3815,899 +2126,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } -/* - * Returns true if the MSR in question is managed via XSTATE, i.e. is context - * switched with the rest of guest FPU state. - * - * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. 
- */ -static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - if (!vcpu) - return false; - - switch (msr) { - case MSR_IA32_U_CET: - return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || - guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); - case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: - return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); - default: - return false; - } -} - -/* - * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an - * MSR that is managed via XSTATE. Note, the caller is responsible for doing - * the initial FPU load, this helper only ensures that guest state is resident - * in hardware (the kernel can load its FPU state in IRQ context). - * - * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the - * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only - * consumed when transitioning to lower privilege levels, i.e. are effectively - * only consumed by userspace as well. - */ -static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, - struct msr_data *msr_info, - int access) -{ - BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); - - KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); - KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); - - kvm_fpu_get(); - if (access == MSR_TYPE_R) - rdmsrq(msr_info->index, msr_info->data); - else - wrmsrq(msr_info->index, msr_info->data); - kvm_fpu_put(); -} - -static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); -} - -static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); -} - -int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - u32 msr = msr_info->index; - u64 data = msr_info->data; - - /* - * Do not allow host-initiated writes to trigger the Xen hypercall - * page setup; it could incur locking 
paths which are not expected - * if userspace sets the MSR in an unusual location. - */ - if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && - !msr_info->host_initiated) - return kvm_xen_write_hypercall_page(vcpu, data); - - switch (msr) { - case MSR_AMD64_NB_CFG: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - case MSR_AMD64_BU_CFG2: - case MSR_AMD64_DC_CFG: - case MSR_AMD64_TW_CFG: - case MSR_F15H_EX_CFG: - break; - - case MSR_IA32_UCODE_REV: - if (msr_info->host_initiated) - vcpu->arch.microcode_version = data; - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated || - !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return KVM_MSR_RET_UNSUPPORTED; - vcpu->arch.arch_capabilities = data; - break; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated || - !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) - return KVM_MSR_RET_UNSUPPORTED; - - if (data & ~kvm_caps.supported_perf_cap) - return 1; - - /* - * Note, this is not just a performance optimization! KVM - * disallows changing feature MSRs after the vCPU has run; PMU - * refresh will bug the VM if called after the vCPU has run. 
- */ - if (vcpu->arch.perf_capabilities == data) - break; - - vcpu->arch.perf_capabilities = data; - kvm_pmu_refresh(vcpu); - kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); - break; - case MSR_IA32_PRED_CMD: { - u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); - - if (!msr_info->host_initiated) { - if ((!guest_has_pred_cmd_msr(vcpu))) - return 1; - - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) - reserved_bits |= PRED_CMD_IBPB; - - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) - reserved_bits |= PRED_CMD_SBPB; - } - - if (!boot_cpu_has(X86_FEATURE_IBPB)) - reserved_bits |= PRED_CMD_IBPB; - - if (!boot_cpu_has(X86_FEATURE_SBPB)) - reserved_bits |= PRED_CMD_SBPB; - - if (data & reserved_bits) - return 1; - - if (!data) - break; - - wrmsrq(MSR_IA32_PRED_CMD, data); - break; - } - case MSR_IA32_FLUSH_CMD: - if (!msr_info->host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) - return 1; - - if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) - return 1; - if (!data) - break; - - wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - break; - case MSR_EFER: - return set_efer(vcpu, msr_info); - case MSR_K7_HWCR: { - /* - * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 - * through at least v6.6 whine if TscFreqSel is clear, - * depending on F/M/S. 
- */ - u64 valid = BIT_ULL(18) | BIT_ULL(24); - - data &= ~(u64)0x40; /* ignore flush filter disable */ - data &= ~(u64)0x100; /* ignore ignne emulation enable */ - data &= ~(u64)0x8; /* ignore TLB cache disable */ - - if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID)) - valid |= MSR_K7_HWCR_CPUID_USER_DIS; - - if (data & ~valid) { - kvm_pr_unimpl_wrmsr(vcpu, msr, data); - return 1; - } - vcpu->arch.msr_hwcr = data; - break; - } - case MSR_FAM10H_MMIO_CONF_BASE: - if (data != 0) { - kvm_pr_unimpl_wrmsr(vcpu, msr, data); - return 1; - } - break; - case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; - - vcpu->arch.pat = data; - break; - case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - return kvm_mtrr_set_msr(vcpu, msr, data); - case MSR_IA32_APICBASE: - return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: - return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSC_DEADLINE: - kvm_set_lapic_tscdeadline_msr(vcpu, data); - break; - case MSR_IA32_TSC_ADJUST: - if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { - if (!msr_info->host_initiated) { - s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; - adjust_tsc_offset_guest(vcpu, adj); - /* Before back to guest, tsc_timestamp must be adjusted - * as well, otherwise guest's percpu pvclock time could jump. - */ - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - } - vcpu->arch.ia32_tsc_adjust_msr = data; - } - break; - case MSR_IA32_MISC_ENABLE: { - u64 old_val = vcpu->arch.ia32_misc_enable_msr; - - if (!msr_info->host_initiated) { - /* RO bits */ - if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) - return 1; - - /* R bits, i.e. writes are ignored, but don't fault. 
*/ - data = data & ~MSR_IA32_MISC_ENABLE_EMON; - data |= old_val & MSR_IA32_MISC_ENABLE_EMON; - } - - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) - return 1; - vcpu->arch.ia32_misc_enable_msr = data; - vcpu->arch.cpuid_dynamic_bits_dirty = true; - } else { - vcpu->arch.ia32_misc_enable_msr = data; - } - break; - } - case MSR_IA32_SMBASE: - if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) - return 1; - vcpu->arch.smbase = data; - break; - case MSR_IA32_POWER_CTL: - vcpu->arch.msr_ia32_power_ctl = data; - break; - case MSR_IA32_TSC: - if (msr_info->host_initiated) { - kvm_synchronize_tsc(vcpu, &data); - } else if (!vcpu->arch.guest_tsc_protected) { - u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; - adjust_tsc_offset_guest(vcpu, adj); - vcpu->arch.ia32_tsc_adjust_msr += adj; - } - break; - case MSR_IA32_XSS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return KVM_MSR_RET_UNSUPPORTED; - - if (data & ~vcpu->arch.guest_supported_xss) - return 1; - if (vcpu->arch.ia32_xss == data) - break; - vcpu->arch.ia32_xss = data; - vcpu->arch.cpuid_dynamic_bits_dirty = true; - break; - case MSR_SMI_COUNT: - if (!msr_info->host_initiated) - return 1; - vcpu->arch.smi_count = data; - break; - case MSR_KVM_WALL_CLOCK_NEW: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return KVM_MSR_RET_UNSUPPORTED; - - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data, 0); - break; - case MSR_KVM_WALL_CLOCK: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return KVM_MSR_RET_UNSUPPORTED; - - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data, 0); - break; - case MSR_KVM_SYSTEM_TIME_NEW: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return KVM_MSR_RET_UNSUPPORTED; - - kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); - break; - case 
MSR_KVM_SYSTEM_TIME: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return KVM_MSR_RET_UNSUPPORTED; - - kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); - break; - case MSR_KVM_ASYNC_PF_EN: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return KVM_MSR_RET_UNSUPPORTED; - - if (kvm_pv_enable_async_pf(vcpu, data)) - return 1; - break; - case MSR_KVM_ASYNC_PF_INT: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return KVM_MSR_RET_UNSUPPORTED; - - if (kvm_pv_enable_async_pf_int(vcpu, data)) - return 1; - break; - case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return KVM_MSR_RET_UNSUPPORTED; - if (data & 0x1) { - /* - * Pairs with the smp_mb__after_atomic() in - * kvm_arch_async_page_present_queued(). - */ - smp_store_mb(vcpu->arch.apf.pageready_pending, false); - - kvm_check_async_pf_completion(vcpu); - } - break; - case MSR_KVM_STEAL_TIME: - if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return KVM_MSR_RET_UNSUPPORTED; - - if (unlikely(!sched_info_on())) - return 1; - - if (data & KVM_STEAL_RESERVED_MASK) - return 1; - - vcpu->arch.st.msr_val = data; - - if (!(data & KVM_MSR_ENABLED)) - break; - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - - break; - case MSR_KVM_PV_EOI_EN: - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return KVM_MSR_RET_UNSUPPORTED; - - if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) - return 1; - break; - - case MSR_KVM_POLL_CONTROL: - if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return KVM_MSR_RET_UNSUPPORTED; - - /* only enable bit supported */ - if (data & (-1ULL << 1)) - return 1; - - vcpu->arch.msr_kvm_poll_control = data; - break; - - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr_info); - - case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: - case MSR_P6_PERFCTR0 ... 
MSR_P6_PERFCTR1: - case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: - case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); - - if (data) - kvm_pr_unimpl_wrmsr(vcpu, msr, data); - break; - case MSR_K7_CLK_CTL: - /* - * Ignore all writes to this no longer documented MSR. - * Writes are only relevant for old K7 processors, - * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to specify the - * affected processor models on the command line, hence - * the need to ignore the workaround. - */ - break; -#ifdef CONFIG_KVM_HYPERV - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: - case HV_X64_MSR_SYNDBG_OPTIONS: - case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - case HV_X64_MSR_REENLIGHTENMENT_CONTROL: - case HV_X64_MSR_TSC_EMULATION_CONTROL: - case HV_X64_MSR_TSC_EMULATION_STATUS: - case HV_X64_MSR_TSC_INVARIANT_CONTROL: - return kvm_hv_set_msr_common(vcpu, msr, data, - msr_info->host_initiated); -#endif - case MSR_IA32_BBL_CR_CTL3: - /* Drop writes to this legacy MSR -- see rdmsr - * counterpart for further detail. 
- */ - kvm_pr_unimpl_wrmsr(vcpu, msr, data); - break; - case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) - return 1; - vcpu->arch.osvw.length = data; - break; - case MSR_AMD64_OSVW_STATUS: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) - return 1; - vcpu->arch.osvw.status = data; - break; - case MSR_PLATFORM_INFO: - if (!msr_info->host_initiated) - return 1; - vcpu->arch.msr_platform_info = data; - break; - case MSR_MISC_FEATURES_ENABLES: - if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || - (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && - !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT))) - return 1; - vcpu->arch.msr_misc_features_enables = data; - break; -#ifdef CONFIG_X86_64 - case MSR_IA32_XFD: - if (!msr_info->host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) - return 1; - - if (data & ~kvm_guest_supported_xfd(vcpu)) - return 1; - - fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); - break; - case MSR_IA32_XFD_ERR: - if (!msr_info->host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) - return 1; - - if (data & ~kvm_guest_supported_xfd(vcpu)) - return 1; - - vcpu->arch.guest_fpu.xfd_err = data; - break; -#endif - case MSR_IA32_U_CET: - case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: - kvm_set_xstate_msr(vcpu, msr_info); - break; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); - - return KVM_MSR_RET_UNSUPPORTED; - } - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common); - -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) -{ - u64 data; - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - u32 offset, last_msr; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P) && !host) - return 1; - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: - last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; - if (msr > last_msr) - return 1; - - if (!(mcg_cap & MCG_CMCI_P) && !host) - return 1; - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, - last_msr + 1 - MSR_IA32_MC0_CTL2); - data = vcpu->arch.mci_ctl2_banks[offset]; - break; - case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; - if (msr > last_msr) - return 1; - - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, - last_msr + 1 - MSR_IA32_MC0_CTL); - data = vcpu->arch.mce_banks[offset]; - break; - default: - return 1; - } - *pdata = data; - return 0; -} - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - switch (msr_info->index) { - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_LASTBRANCHFROMIP: - case MSR_IA32_LASTBRANCHTOIP: - case MSR_IA32_LASTINTFROMIP: - case MSR_IA32_LASTINTTOIP: - case MSR_AMD64_SYSCFG: - case MSR_K8_TSEG_ADDR: - case MSR_K8_TSEG_MASK: - case MSR_VM_HSAVE_PA: - case MSR_K8_INT_PENDING_MSG: - case MSR_AMD64_NB_CFG: - case MSR_FAM10H_MMIO_CONF_BASE: - case MSR_AMD64_BU_CFG2: - case MSR_IA32_PERF_CTL: - case MSR_AMD64_DC_CFG: - case MSR_AMD64_TW_CFG: - case MSR_F15H_EX_CFG: - /* - * Intel Sandy Bridge CPUs must support the RAPL (running average power - * limit) MSRs. Just return 0, as we do not want to expose the host - * data here. Do not conditionalize this on CPUID, as KVM does not do - * so for existing CPU-specific MSRs. - */ - case MSR_RAPL_POWER_UNIT: - case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ - case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ - case MSR_PKG_ENERGY_STATUS: /* Total package */ - case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ - msr_info->data = 0; - break; - case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: - case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - msr_info->data = 0; - break; - case MSR_IA32_UCODE_REV: - msr_info->data = vcpu->arch.microcode_version; - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return KVM_MSR_RET_UNSUPPORTED; - msr_info->data = vcpu->arch.arch_capabilities; - break; - case MSR_IA32_PERF_CAPABILITIES: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) - return KVM_MSR_RET_UNSUPPORTED; - msr_info->data = vcpu->arch.perf_capabilities; - break; - case MSR_IA32_POWER_CTL: - msr_info->data = vcpu->arch.msr_ia32_power_ctl; - break; - case MSR_IA32_TSC: { - /* - * Intel SDM states that MSR_IA32_TSC read adds the TSC offset - * even when not intercepted. AMD manual doesn't explicitly - * state this but appears to behave the same. - * - * On userspace reads and writes, however, we unconditionally - * return L1's TSC value to ensure backwards-compatible - * behavior for migration. - */ - u64 offset, ratio; - - if (msr_info->host_initiated) { - offset = vcpu->arch.l1_tsc_offset; - ratio = vcpu->arch.l1_tsc_scaling_ratio; - } else { - offset = vcpu->arch.tsc_offset; - ratio = vcpu->arch.tsc_scaling_ratio; - } - - msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; - break; - } - case MSR_IA32_CR_PAT: - msr_info->data = vcpu->arch.pat; - break; - case MSR_MTRRcap: - case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); - case 0xcd: /* fsb frequency */ - msr_info->data = 3; - break; - /* - * MSR_EBC_FREQUENCY_ID - * Conservative value valid for even the basic CPU models. - * Models 0,1: 000 in bits 23:21 indicating a bus speed of - * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, - * and 266MHz for model 3, or 4. 
Set Core Clock - * Frequency to System Bus Frequency Ratio to 1 (bits - * 31:24) even though these are only valid for CPU - * models > 2, however guests may end up dividing or - * multiplying by zero otherwise. - */ - case MSR_EBC_FREQUENCY_ID: - msr_info->data = 1 << 24; - break; - case MSR_IA32_APICBASE: - msr_info->data = vcpu->arch.apic_base; - break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: - return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSC_DEADLINE: - msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); - break; - case MSR_IA32_TSC_ADJUST: - msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; - break; - case MSR_IA32_MISC_ENABLE: - msr_info->data = vcpu->arch.ia32_misc_enable_msr; - break; - case MSR_IA32_SMBASE: - if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) - return 1; - msr_info->data = vcpu->arch.smbase; - break; - case MSR_SMI_COUNT: - msr_info->data = vcpu->arch.smi_count; - break; - case MSR_IA32_PERF_STATUS: - /* TSC increment by tick */ - msr_info->data = 1000ULL; - /* CPU multiplier */ - msr_info->data |= (((uint64_t)4ULL) << 40); - break; - case MSR_EFER: - msr_info->data = vcpu->arch.efer; - break; - case MSR_KVM_WALL_CLOCK: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_WALL_CLOCK_NEW: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.time; - break; - case MSR_KVM_SYSTEM_TIME_NEW: - if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.time; - break; - case MSR_KVM_ASYNC_PF_EN: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return KVM_MSR_RET_UNSUPPORTED; - - 
msr_info->data = vcpu->arch.apf.msr_en_val; - break; - case MSR_KVM_ASYNC_PF_INT: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.apf.msr_int_val; - break; - case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = 0; - break; - case MSR_KVM_STEAL_TIME: - if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.st.msr_val; - break; - case MSR_KVM_PV_EOI_EN: - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.pv_eoi.msr_val; - break; - case MSR_KVM_POLL_CONTROL: - if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return KVM_MSR_RET_UNSUPPORTED; - - msr_info->data = vcpu->arch.msr_kvm_poll_control; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data, - msr_info->host_initiated); - case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; - case MSR_K7_CLK_CTL: - /* - * Provide expected ramp-up count for K7. All other - * are set to zero, indicating minimum divisors for - * every field. - * - * This prevents guest kernels on AMD host with CPU - * type 6, model 8 and higher from exploding due to - * the rdmsr failing. - */ - msr_info->data = 0x20000000; - break; -#ifdef CONFIG_KVM_HYPERV - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: - case HV_X64_MSR_SYNDBG_OPTIONS: - case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: - case HV_X64_MSR_CRASH_CTL: - case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: - case HV_X64_MSR_REENLIGHTENMENT_CONTROL: - case HV_X64_MSR_TSC_EMULATION_CONTROL: - case HV_X64_MSR_TSC_EMULATION_STATUS: - case HV_X64_MSR_TSC_INVARIANT_CONTROL: - return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data, - msr_info->host_initiated); -#endif - case MSR_IA32_BBL_CR_CTL3: - /* This legacy MSR exists but isn't fully documented in current - * silicon. It is however accessed by winxp in very narrow - * scenarios where it sets bit #19, itself documented as - * a "reserved" bit. Best effort attempt to source coherent - * read data here should the balance of the register be - * interpreted by the guest: - * - * L2 cache control register 3: 64GB range, 256KB size, - * enabled, latency 0x1, configured - */ - msr_info->data = 0xbe702111; - break; - case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) - return 1; - msr_info->data = vcpu->arch.osvw.length; - break; - case MSR_AMD64_OSVW_STATUS: - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) - return 1; - msr_info->data = vcpu->arch.osvw.status; - break; - case MSR_PLATFORM_INFO: - if (!msr_info->host_initiated && - !vcpu->kvm->arch.guest_can_read_msr_platform_info) - return 1; - msr_info->data = vcpu->arch.msr_platform_info; - break; - case MSR_MISC_FEATURES_ENABLES: - msr_info->data = vcpu->arch.msr_misc_features_enables; - break; - case MSR_K7_HWCR: - msr_info->data = vcpu->arch.msr_hwcr; - break; -#ifdef CONFIG_X86_64 - case MSR_IA32_XFD: - if (!msr_info->host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) - return 1; - - msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; - break; - case MSR_IA32_XFD_ERR: - if (!msr_info->host_initiated && - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) - return 1; - - msr_info->data = vcpu->arch.guest_fpu.xfd_err; - break; -#endif - case MSR_IA32_U_CET: - case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: - kvm_get_xstate_msr(vcpu, msr_info); - break; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - - return KVM_MSR_RET_UNSUPPORTED; - } - return 0; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); - -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - bool fpu_loaded = false; - int i; - - for (i = 0; i < msrs->nmsrs; ++i) { - /* - * If userspace is accessing one or more XSTATE-managed MSRs, - * temporarily load the guest's FPU state so that the guest's - * MSR value(s) is resident in hardware and thus can be accessed - * via RDMSR/WRMSR. - */ - if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { - kvm_load_guest_fpu(vcpu); - fpu_loaded = true; - } - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; - } - if (fpu_loaded) - kvm_put_guest_fpu(vcpu); - - return i; -} - -/* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. 
- */ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) -{ - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - unsigned size; - int r; - - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) - goto out; - - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) - goto out; - - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = memdup_user(user_msrs->entries, size); - if (IS_ERR(entries)) { - r = PTR_ERR(entries); - goto out; - } - - r = __msr_io(vcpu, &msrs, entries, do_msr); - - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - r = -EFAULT; - - kfree(entries); -out: - return r; -} - static inline bool kvm_can_mwait_in_guest(void) { return boot_cpu_has(X86_FEATURE_MWAIT) && @@ -5026,32 +2444,9 @@ long kvm_arch_dev_ioctl(struct file *filp, long r; switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; - - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; - if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) - goto out; - r = -E2BIG; - if (n < msr_list.nmsrs) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices + num_msrs_to_save, - &emulated_msrs, - num_emulated_msrs * sizeof(u32))) - goto out; - r = 0; + case KVM_GET_MSR_INDEX_LIST: + r = kvm_get_msr_index_list(argp); break; - } case KVM_GET_SUPPORTED_CPUID: case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; @@ -5079,30 +2474,11 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; r = 0; break; - case KVM_GET_MSR_FEATURE_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct 
kvm_msr_list msr_list; - unsigned int n; - - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msr_based_features; - if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) - goto out; - r = -E2BIG; - if (n < msr_list.nmsrs) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msr_based_features, - num_msr_based_features * sizeof(u32))) - goto out; - r = 0; + case KVM_GET_MSR_FEATURE_INDEX_LIST: + r = kvm_get_feature_msr_index_list(argp); break; - } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_feature_msr, 1); + r = kvm_get_feature_msrs(argp); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: @@ -5322,6 +2698,18 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, return 0; } +static bool kvm_is_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* + * Note, .interrupt_allowed() returns -EBUSY if interrupts are allowed + * based on CPU state, but can't be immediately delivered due to a + * pending nested VM-Enter. Treat that case as "allowed", because + * the goal here is just to check if interrupts are architecturally + * allowed, not to check if they can be injected. + */ + return kvm_x86_call(interrupt_allowed)(vcpu, false); +} + static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) { /* @@ -5347,7 +2735,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) * or KVM_SET_SREGS. For that to work, we must be at an * instruction boundary and with no events half-injected. 
*/ - return (kvm_arch_interrupt_allowed(vcpu) && + return (kvm_is_interrupt_allowed(vcpu) && kvm_cpu_accept_dm_intr(vcpu) && !kvm_event_needs_reinjection(vcpu) && !kvm_is_exception_pending(vcpu)); @@ -5532,7 +2920,7 @@ static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *v return &vcpu->arch.exception; } -static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu) +void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu); @@ -5736,57 +3124,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return 0; } -static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - unsigned int i; - - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - kvm_handle_exception_payload_quirk(vcpu); - - memset(dbgregs, 0, sizeof(*dbgregs)); - - BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) - dbgregs->db[i] = vcpu->arch.db[i]; - - dbgregs->dr6 = vcpu->arch.dr6; - dbgregs->dr7 = vcpu->arch.dr7; - return 0; -} - -static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - unsigned int i; - - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - if (dbgregs->flags) - return -EINVAL; - - if (!kvm_dr6_valid(dbgregs->dr6)) - return -EINVAL; - if (!kvm_dr7_valid(dbgregs->dr7)) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) - vcpu->arch.db[i] = dbgregs->db[i]; - - kvm_update_dr0123(vcpu); - vcpu->arch.dr6 = dbgregs->dr6; - vcpu->arch.dr7 = dbgregs->dr7; - kvm_update_dr7(vcpu); - - return 0; -} - - static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, u8 *state, unsigned int size) { @@ -6058,134 +3395,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } } -struct 
kvm_x86_reg_id { - __u32 index; - __u8 type; - __u8 rsvd1; - __u8 rsvd2:4; - __u8 size:4; - __u8 x86; -}; - -static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, - struct kvm_x86_reg_id *reg) -{ - switch (reg->index) { - case KVM_REG_GUEST_SSP: - /* - * FIXME: If host-initiated accesses are ever exempted from - * ignore_msrs (in kvm_do_msr_access()), drop this manual check - * and rely on KVM's standard checks to reject accesses to regs - * that don't exist. - */ - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) - return -EINVAL; - - reg->type = KVM_X86_REG_TYPE_MSR; - reg->index = MSR_KVM_INTERNAL_GUEST_SSP; - break; - default: - return -EINVAL; - } - return 0; -} - -static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) -{ - u64 val; - - if (do_get_msr(vcpu, msr, &val)) - return -EINVAL; - - if (put_user(val, user_val)) - return -EFAULT; - - return 0; -} - -static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) -{ - u64 val; - - if (get_user(val, user_val)) - return -EFAULT; - - if (do_set_msr(vcpu, msr, &val)) - return -EINVAL; - - return 0; -} - -static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, - void __user *argp) -{ - struct kvm_one_reg one_reg; - struct kvm_x86_reg_id *reg; - u64 __user *user_val; - bool load_fpu; - int r; - - if (copy_from_user(&one_reg, argp, sizeof(one_reg))) - return -EFAULT; - - if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) - return -EINVAL; - - reg = (struct kvm_x86_reg_id *)&one_reg.id; - if (reg->rsvd1 || reg->rsvd2) - return -EINVAL; - - if (reg->type == KVM_X86_REG_TYPE_KVM) { - r = kvm_translate_kvm_reg(vcpu, reg); - if (r) - return r; - } - - if (reg->type != KVM_X86_REG_TYPE_MSR) - return -EINVAL; - - if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) - return -EINVAL; - - guard(srcu)(&vcpu->kvm->srcu); - - load_fpu = is_xstate_managed_msr(vcpu, reg->index); - if (load_fpu) - kvm_load_guest_fpu(vcpu); - - user_val = 
u64_to_user_ptr(one_reg.addr); - if (ioctl == KVM_GET_ONE_REG) - r = kvm_get_one_msr(vcpu, reg->index, user_val); - else - r = kvm_set_one_msr(vcpu, reg->index, user_val); - - if (load_fpu) - kvm_put_guest_fpu(vcpu); - return r; -} - -static int kvm_get_reg_list(struct kvm_vcpu *vcpu, - struct kvm_reg_list __user *user_list) -{ - u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; - u64 user_nr_regs; - - if (get_user(user_nr_regs, &user_list->n)) - return -EFAULT; - - if (put_user(nr_regs, &user_list->n)) - return -EFAULT; - - if (user_nr_regs < nr_regs) - return -E2BIG; - - if (nr_regs && - put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) - return -EFAULT; - - return 0; -} - long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6290,18 +3499,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_MSRS: { - int idx = srcu_read_lock(&vcpu->kvm->srcu); - r = msr_io(vcpu, argp, do_get_msr, 1); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + case KVM_GET_MSRS: + r = kvm_get_msrs(vcpu, argp); break; - } - case KVM_SET_MSRS: { - int idx = srcu_read_lock(&vcpu->kvm->srcu); - r = msr_io(vcpu, argp, do_set_msr, 0); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + case KVM_SET_MSRS: + r = kvm_set_msrs(vcpu, argp); break; - } case KVM_GET_ONE_REG: case KVM_SET_ONE_REG: r = kvm_get_set_one_reg(vcpu, ioctl, argp); @@ -6623,7 +3826,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -ENOMEM; if (!u.sregs2) goto out; - __get_sregs2(vcpu, u.sregs2); + kvm_vcpu_ioctl_x86_get_sregs2(vcpu, u.sregs2); r = -EFAULT; if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) goto out; @@ -6642,7 +3845,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u.sregs2 = NULL; goto out; } - r = __set_sregs2(vcpu, u.sregs2); + r = kvm_vcpu_ioctl_x86_set_sregs2(vcpu, u.sregs2); break; } case KVM_HAS_DEVICE_ATTR: @@ -6994,113 +4197,6 @@ disable_exits_unlock: return r; } -static struct kvm_x86_msr_filter 
*kvm_alloc_msr_filter(bool default_allow) -{ - struct kvm_x86_msr_filter *msr_filter; - - msr_filter = kzalloc_obj(*msr_filter, GFP_KERNEL_ACCOUNT); - if (!msr_filter) - return NULL; - - msr_filter->default_allow = default_allow; - return msr_filter; -} - -static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) -{ - u32 i; - - if (!msr_filter) - return; - - for (i = 0; i < msr_filter->count; i++) - kfree(msr_filter->ranges[i].bitmap); - - kfree(msr_filter); -} - -static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, - struct kvm_msr_filter_range *user_range) -{ - unsigned long *bitmap; - size_t bitmap_size; - - if (!user_range->nmsrs) - return 0; - - if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) - return -EINVAL; - - if (!user_range->flags) - return -EINVAL; - - bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); - if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) - return -EINVAL; - - bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); - if (IS_ERR(bitmap)) - return PTR_ERR(bitmap); - - msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { - .flags = user_range->flags, - .base = user_range->base, - .nmsrs = user_range->nmsrs, - .bitmap = bitmap, - }; - - msr_filter->count++; - return 0; -} - -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, - struct kvm_msr_filter *filter) -{ - struct kvm_x86_msr_filter *new_filter, *old_filter; - bool default_allow; - bool empty = true; - int r; - u32 i; - - if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) - empty &= !filter->ranges[i].nmsrs; - - default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); - if (empty && !default_allow) - return -EINVAL; - - new_filter = kvm_alloc_msr_filter(default_allow); - if (!new_filter) - return -ENOMEM; - - for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { - r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); 
- if (r) { - kvm_free_msr_filter(new_filter); - return r; - } - } - - mutex_lock(&kvm->lock); - old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, - mutex_is_locked(&kvm->lock)); - mutex_unlock(&kvm->lock); - synchronize_srcu(&kvm->srcu); - - kvm_free_msr_filter(old_filter); - - /* - * Recalc MSR intercepts as userspace may want to intercept accesses to - * MSRs that KVM would otherwise pass through to the guest. - */ - kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); - - return 0; -} - #ifdef CONFIG_KVM_COMPAT /* for KVM_X86_SET_MSR_FILTER */ struct kvm_msr_filter_range_compat { @@ -7621,157 +4717,6 @@ out: return r; } -static void kvm_probe_feature_msr(u32 msr_index) -{ - u64 data; - - if (kvm_get_feature_msr(NULL, msr_index, &data, true)) - return; - - msr_based_features[num_msr_based_features++] = msr_index; -} - -static void kvm_probe_msr_to_save(u32 msr_index) -{ - u32 dummy[2]; - - if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) - return; - - /* - * Even MSRs that are valid in the host may not be exposed to guests in - * some cases. - */ - switch (msr_index) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - return; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - return; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - return; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - return; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - return; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - return; - break; - case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (msr_index - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) - return; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... - MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: - if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= - kvm_pmu_cap.num_counters_gp) - return; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... - MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: - if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= - kvm_pmu_cap.num_counters_gp) - return; - break; - case MSR_ARCH_PERFMON_FIXED_CTR0 ... - MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: - if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= - kvm_pmu_cap.num_counters_fixed) - return; - break; - case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: - case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: - case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: - case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: - if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) - return; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - return; - break; - case MSR_IA32_TSX_CTRL: - if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) - return; - break; - case MSR_IA32_XSS: - if (!kvm_caps.supported_xss) - return; - break; - case MSR_IA32_U_CET: - case MSR_IA32_S_CET: - if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && - !kvm_cpu_cap_has(X86_FEATURE_IBT)) - return; - break; - case MSR_IA32_INT_SSP_TAB: - if (!kvm_cpu_cap_has(X86_FEATURE_LM)) - return; - fallthrough; - case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: - if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) - return; - break; - default: - break; - } - - msrs_to_save[num_msrs_to_save++] = msr_index; -} - -static void kvm_init_msr_lists(void) -{ - unsigned i; - - BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3, - "Please update the fixed PMCs in msrs_to_save_pmu[]"); - - num_msrs_to_save = 0; - num_emulated_msrs = 0; - num_msr_based_features = 0; - - for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) - kvm_probe_msr_to_save(msrs_to_save_base[i]); - - if (enable_pmu) { - for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) - kvm_probe_msr_to_save(msrs_to_save_pmu[i]); - } - - for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_call(has_emulated_msr)(NULL, - emulated_msrs_all[i])) - continue; - - emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; - } - - for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) - kvm_probe_feature_msr(i); - - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) - kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]); -} - static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *__v) { @@ -7821,36 +4766,24 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return handled; } -void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_call(set_segment)(vcpu, var, seg); -} - -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_call(get_segment)(vcpu, var, seg); -} - gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? 
PFERR_USER_MASK : 0; - return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); + return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); + return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write); @@ -7858,21 +4791,21 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; - return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); + return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); + gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -7900,14 +4833,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? 
PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ - gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, - exception); + gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access|PFERR_FETCH_MASK, + exception); if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; @@ -7959,12 +4892,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); + gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -8065,7 +4998,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); @@ -8075,7 +5008,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * shadow page table for L2 guest. 
*/ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || - !permission_fault(vcpu, vcpu->arch.walk_mmu, + !permission_fault(vcpu, gva_walk, vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); @@ -8083,7 +5016,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 1; } - *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); + *gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception); if (*gpa == INVALID_GPA) return -1; @@ -8492,11 +5425,6 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); } -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - return kvm_x86_call(get_segment_base)(vcpu, seg); -} - static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); @@ -8641,7 +5569,7 @@ static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) static unsigned long emulator_get_cached_segment_base( struct x86_emulate_ctxt *ctxt, int seg) { - return get_segment_base(emul_to_vcpu(ctxt), seg); + return kvm_get_segment_base(emul_to_vcpu(ctxt), seg); } static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, @@ -8714,61 +5642,22 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r; - - r = kvm_emulate_msr_read(vcpu, msr_index, pdata); - if (r < 0) - return X86EMUL_UNHANDLEABLE; - if (r) { - if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, - complete_emulated_rdmsr, r)) - return X86EMUL_IO_NEEDED; - - trace_kvm_msr_read_ex(msr_index); - return X86EMUL_PROPAGATE_FAULT; - } - - trace_kvm_msr_read(msr_index, *pdata); - return X86EMUL_CONTINUE; + return kvm_emulator_get_msr_with_filter(vcpu, msr_index, pdata); } static int 
emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r; - - r = kvm_emulate_msr_write(vcpu, msr_index, data); - if (r < 0) - return X86EMUL_UNHANDLEABLE; - if (r) { - if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, - complete_emulated_msr_access, r)) - return X86EMUL_IO_NEEDED; - - trace_kvm_msr_write_ex(msr_index, data); - return X86EMUL_PROPAGATE_FAULT; - } - - trace_kvm_msr_write(msr_index, data); - return X86EMUL_CONTINUE; + return kvm_emulator_set_msr_with_filter(vcpu, msr_index, data); } static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - /* - * Treat emulator accesses to the current shadow stack pointer as host- - * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), - * and this API is used only for implicit accesses, i.e. not RDMSR, and - * so the index is fully KVM-controlled. - */ - if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) - return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); - - return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); + return kvm_emulator_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) @@ -11596,7 +8485,7 @@ bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) return true; - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) + if (kvm_is_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) return true; if (kvm_hv_has_stimer_pending(vcpu)) @@ -11902,28 +8791,6 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -/* Swap (qemu) user FPU context for the guest FPU context. 
*/ -static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) - return; - - /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ - fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); - trace_kvm_fpu(1); -} - -/* When vcpu_run ends, restore user space FPU context. */ -static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) - return; - - fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); -} - static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) { /* @@ -12073,179 +8940,6 @@ out: return r; } -static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { - /* - * We are here if userspace calls get_regs() in the middle of - * instruction emulation. Registers state needs to be copied - * back from emulation context to vcpu. Userspace shouldn't do - * that usually, but some bad designed PV devices (vmware - * backdoor interface) need this to work - */ - emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); - vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - } - regs->rax = kvm_rax_read_raw(vcpu); - regs->rbx = kvm_rbx_read_raw(vcpu); - regs->rcx = kvm_rcx_read_raw(vcpu); - regs->rdx = kvm_rdx_read_raw(vcpu); - regs->rsi = kvm_rsi_read_raw(vcpu); - regs->rdi = kvm_rdi_read_raw(vcpu); - regs->rsp = kvm_rsp_read(vcpu); - regs->rbp = kvm_rbp_read_raw(vcpu); -#ifdef CONFIG_X86_64 - regs->r8 = kvm_r8_read_raw(vcpu); - regs->r9 = kvm_r9_read_raw(vcpu); - regs->r10 = kvm_r10_read_raw(vcpu); - regs->r11 = kvm_r11_read_raw(vcpu); - regs->r12 = kvm_r12_read_raw(vcpu); - regs->r13 = kvm_r13_read_raw(vcpu); - regs->r14 = kvm_r14_read_raw(vcpu); - regs->r15 = kvm_r15_read_raw(vcpu); -#endif - - regs->rip = kvm_rip_read(vcpu); - regs->rflags = kvm_get_rflags(vcpu); -} - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu 
*vcpu, struct kvm_regs *regs) -{ - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - vcpu_load(vcpu); - __get_regs(vcpu, regs); - vcpu_put(vcpu); - return 0; -} - -static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu->arch.emulate_regs_need_sync_from_vcpu = true; - vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - - kvm_rax_write_raw(vcpu, regs->rax); - kvm_rbx_write_raw(vcpu, regs->rbx); - kvm_rcx_write_raw(vcpu, regs->rcx); - kvm_rdx_write_raw(vcpu, regs->rdx); - kvm_rsi_write_raw(vcpu, regs->rsi); - kvm_rdi_write_raw(vcpu, regs->rdi); - kvm_rsp_write(vcpu, regs->rsp); - kvm_rbp_write_raw(vcpu, regs->rbp); -#ifdef CONFIG_X86_64 - kvm_r8_write_raw(vcpu, regs->r8); - kvm_r9_write_raw(vcpu, regs->r9); - kvm_r10_write_raw(vcpu, regs->r10); - kvm_r11_write_raw(vcpu, regs->r11); - kvm_r12_write_raw(vcpu, regs->r12); - kvm_r13_write_raw(vcpu, regs->r13); - kvm_r14_write_raw(vcpu, regs->r14); - kvm_r15_write_raw(vcpu, regs->r15); -#endif - - kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception_vmexit.pending = false; - - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - vcpu_load(vcpu); - __set_regs(vcpu, regs); - vcpu_put(vcpu); - return 0; -} - -static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - struct desc_ptr dt; - - if (vcpu->arch.guest_state_protected) - goto skip_protected_regs; - - kvm_handle_exception_payload_quirk(vcpu); - - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - 
kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_x86_call(get_idt)(vcpu, &dt); - sregs->idt.limit = dt.size; - sregs->idt.base = dt.address; - kvm_x86_call(get_gdt)(vcpu, &dt); - sregs->gdt.limit = dt.size; - sregs->gdt.base = dt.address; - - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = kvm_read_cr3(vcpu); - -skip_protected_regs: - sregs->cr0 = kvm_read_cr0(vcpu); - sregs->cr4 = kvm_read_cr4(vcpu); - sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.efer; - sregs->apic_base = vcpu->arch.apic_base; -} - -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - __get_sregs_common(vcpu, sregs); - - if (vcpu->arch.guest_state_protected) - return; - - if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) - set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); -} - -static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) -{ - int i; - - __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); - - if (vcpu->arch.guest_state_protected) - return; - - if (is_pae_paging(vcpu)) { - kvm_vcpu_srcu_read_lock(vcpu); - for (i = 0 ; i < 4 ; i++) - sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); - sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; - kvm_vcpu_srcu_read_unlock(vcpu); - } -} - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - vcpu_load(vcpu); - __get_sregs(vcpu, sregs); - vcpu_put(vcpu); - return 0; -} - int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { @@ -12365,173 +9059,6 @@ unhandled_task_switch: } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch); -static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { - /* - * 
When EFER.LME and CR0.PG are set, the processor is in - * 64-bit mode (though maybe in a 32-bit code segment). - * CR4.PAE and EFER.LMA must be set. - */ - if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) - return false; - if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3)) - return false; - } else { - /* - * Not in 64-bit mode: EFER.LMA is clear and the code - * segment cannot be 64-bit. - */ - if (sregs->efer & EFER_LMA || sregs->cs.l) - return false; - } - - return kvm_is_valid_cr4(vcpu, sregs->cr4) && - kvm_is_valid_cr0(vcpu, sregs->cr0); -} - -static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, - int *mmu_reset_needed, bool update_pdptrs) -{ - int idx; - struct desc_ptr dt; - - if (!kvm_is_valid_sregs(vcpu, sregs)) - return -EINVAL; - - if (kvm_apic_set_base(vcpu, sregs->apic_base, true)) - return -EINVAL; - - if (vcpu->arch.guest_state_protected) - return 0; - - dt.size = sregs->idt.limit; - dt.address = sregs->idt.base; - kvm_x86_call(set_idt)(vcpu, &dt); - dt.size = sregs->gdt.limit; - dt.address = sregs->gdt.base; - kvm_x86_call(set_gdt)(vcpu, &dt); - - vcpu->arch.cr2 = sregs->cr2; - *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); - kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); - - *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_call(set_efer)(vcpu, sregs->efer); - - *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_call(set_cr0)(vcpu, sregs->cr0); - - *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - kvm_x86_call(set_cr4)(vcpu, sregs->cr4); - - if (update_pdptrs) { - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, kvm_read_cr3(vcpu)); - *mmu_reset_needed = 1; - } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - } - - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - 
kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_set_cr8(vcpu, sregs->cr8); - - /* Older userspace won't unhalt the vcpu on reset. */ - if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && - sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !is_protmode(vcpu)) - kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); - - return 0; -} - -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) -{ - int pending_vec, max_bits; - int mmu_reset_needed = 0; - int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); - - if (ret) - return ret; - - if (mmu_reset_needed) { - kvm_mmu_reset_context(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); - } - - max_bits = KVM_NR_INTERRUPTS; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); - - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, false); - pr_debug("Set back pending irq %d\n", pending_vec); - kvm_make_request(KVM_REQ_EVENT, vcpu); - } - return 0; -} - -static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) -{ - int mmu_reset_needed = 0; - bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; - bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && - !(sregs2->efer & EFER_LMA); - int i, ret; - - if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) - return -EINVAL; - - if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) - return -EINVAL; - - ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, - &mmu_reset_needed, !valid_pdptrs); - if (ret) - return ret; - - if (valid_pdptrs) { - for (i = 0; i < 4 ; i++) - kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); - - kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR); - mmu_reset_needed = 1; - 
vcpu->arch.pdptrs_from_userspace = true; - } - if (mmu_reset_needed) { - kvm_mmu_reset_context(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); - } - return 0; -} - -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - int ret; - - if (vcpu->kvm->arch.has_protected_state && - vcpu->arch.guest_state_protected) - return -EINVAL; - - vcpu_load(vcpu); - ret = __set_sregs(vcpu, sregs); - vcpu_put(vcpu); - return ret; -} - static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) { bool set = false; @@ -12687,13 +9214,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void store_regs(struct kvm_vcpu *vcpu) { - BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES); - - if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS) - __get_regs(vcpu, &vcpu->run->s.regs.regs); - - if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS) - __get_sregs(vcpu, &vcpu->run->s.regs.sregs); + kvm_run_sync_regs_to_user(vcpu); if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS) kvm_vcpu_ioctl_x86_get_vcpu_events( @@ -12702,19 +9223,8 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { - __set_regs(vcpu, &vcpu->run->s.regs.regs); - vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; - } - - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { - struct kvm_sregs sregs = vcpu->run->s.regs.sregs; - - if (__set_sregs(vcpu, &sregs)) - return -EINVAL; - - vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; - } + if (kvm_run_sync_regs_from_user(vcpu)) + return -EINVAL; if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { struct kvm_vcpu_events events = vcpu->run->s.regs.events; @@ -13457,13 +9967,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.created_mediated_pmu) perf_release_mediated_pmu(); kvm_destroy_vcpus(kvm); - kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, 
&kvm->srcu, 1)); + kvm_free_msr_filter((void * __force)kvm->arch.msr_filter); #ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); #endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); - kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); + kfree((void * __force)kvm->arch.pmu_event_filter); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); @@ -13811,56 +10321,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return kvm_x86_call(interrupt_allowed)(vcpu, false); -} - -unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) -{ - /* Can't read the RIP when guest state is protected, just return 0 */ - if (vcpu->arch.guest_state_protected) - return 0; - - if (is_64_bit_mode(vcpu)) - return kvm_rip_read(vcpu); - return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + - kvm_rip_read(vcpu)); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip); - -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) -{ - return kvm_get_linear_rip(vcpu) == linear_rip; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip); - -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags; - - rflags = kvm_x86_call(get_rflags)(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~X86_EFLAGS_TF; - return rflags; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags); - -static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) - rflags |= X86_EFLAGS_TF; - kvm_x86_call(set_rflags)(vcpu, rflags); -} - -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - __kvm_set_rflags(vcpu, rflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); - static inline u32 
kvm_async_pf_hash_fn(gfn_t gfn) { BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); @@ -13996,7 +10456,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_arch_interrupt_allowed(vcpu); + return kvm_is_interrupt_allowed(vcpu); } bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -14139,43 +10599,17 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) #endif #endif -int kvm_spec_ctrl_test_value(u64 value) -{ - /* - * test that setting IA32_SPEC_CTRL to given value - * is allowed by the host processor - */ - - u64 saved_value; - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - - if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) - ret = 1; - else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) - ret = 1; - else - wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); - - local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); - void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk; struct x86_exception fault; u64 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { + gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, &fault) != INVALID_GPA) { /* - * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * If gva_walk->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed * with the error code that the processor gave. 
*/ @@ -14186,7 +10620,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c fault.address = gva; fault.async_page_fault = false; } - vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault, true); + gva_walk->inject_page_fault(vcpu, &fault, true); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de577ef9c97..8ece468087a8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -6,53 +6,16 @@ #include <asm/fpu/xstate.h> #include <asm/mce.h> #include <asm/pvclock.h> +#include "msrs.h" +#include "mmu.h" #include "regs.h" #include "kvm_emulate.h" #include "cpuid.h" #define KVM_MAX_MCE_BANKS 32 -struct kvm_caps { - /* control of guest tsc rate supported? */ - bool has_tsc_control; - /* maximum supported tsc_khz for guests */ - u32 max_guest_tsc_khz; - /* number of bits of the fractional part of the TSC scaling ratio */ - u8 tsc_scaling_ratio_frac_bits; - /* maximum allowed value of TSC scaling ratio */ - u64 max_tsc_scaling_ratio; - /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ - u64 default_tsc_scaling_ratio; - /* bus lock detection supported? */ - bool has_bus_lock_exit; - /* notify VM exit supported? */ - bool has_notify_vmexit; - /* bit mask of VM types */ - u32 supported_vm_types; - - u64 supported_mce_cap; - u64 supported_xcr0; - u64 supported_xss; - u64 supported_perf_cap; - - u64 supported_quirks; - u64 inapplicable_quirks; -}; - -struct kvm_host_values { - /* - * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical - * address bits irrespective of features that repurpose legal bits, - * e.g. MKTME. - */ - u8 maxphyaddr; - - u64 efer; - u64 xcr0; - u64 xss; - u64 s_cet; - u64 arch_capabilities; -}; +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); void kvm_spurious_fault(void); @@ -86,14 +49,6 @@ do { \ failed; \ }) -/* - * The first...last VMX feature MSRs that are emulated by KVM. 
This may or may - * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an - * associated feature that KVM supports for nested virtualization. - */ -#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC -#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC - #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 @@ -102,16 +57,6 @@ do { \ #define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX #define KVM_SVM_DEFAULT_PLE_WINDOW 3000 -/* - * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves - * are arbitrary and have no meaning, the only requirement is that they don't - * conflict with "real" MSRs that KVM supports. Use values at the upper end - * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values - * will be usable until KVM exhausts its supply of paravirtual MSR indices. - */ - -#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff - static inline unsigned int __grow_ple_window(unsigned int val, unsigned int base, unsigned int modifier, unsigned int max) { @@ -142,9 +87,6 @@ static inline unsigned int __shrink_ple_window(unsigned int val, return max(val, min); } -#define MSR_IA32_CR_PAT_DEFAULT \ - PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) - void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); @@ -252,11 +194,6 @@ static inline bool x86_exception_has_error_code(unsigned int vector) return (1U << vector) & exception_has_error_code; } -static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 
57 : 48; @@ -384,6 +321,8 @@ static __always_inline void kvm_request_l1tf_flush_l1d(void) #endif } +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); + void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); @@ -391,6 +330,29 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); int kvm_guest_time_update(struct kvm_vcpu *v); +void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value); +u64 kvm_scale_tsc(u64 tsc, u64 ratio); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); +u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc); +void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset); + +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.l1_tsc_offset + adjustment); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) + WARN_ON(adjustment < 0); + adjustment = kvm_scale_tsc((u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); + adjust_tsc_offset_guest(vcpu, adjustment); +} + int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); @@ -403,21 +365,310 @@ int handle_ud(struct kvm_vcpu *vcpu); void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct kvm_queued_exception *ex); +void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu); -int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); -int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int 
x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); -fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); +/* + * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing + * userspace I/O) to indicate that the emulation context + * should be reused as is, i.e. skip initialization of + * emulation context, instruction fetch and decode. + * + * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. + * Indicates that only select instructions (tagged with + * EmulateOnUD) should be emulated (to minimize the emulator + * attack surface). See also EMULTYPE_TRAP_UD_FORCED. + * + * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to + * decode the instruction length. For use *only* by + * kvm_x86_ops.skip_emulated_instruction() implementations if + * EMULTYPE_COMPLETE_USER_EXIT is not set. + * + * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to + * retry native execution under certain conditions, + * Can only be set in conjunction with EMULTYPE_PF. + * + * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was + * triggered by KVM's magic "force emulation" prefix, + * which is opt in via module param (off by default). + * Bypasses EmulateOnUD restriction despite emulating + * due to an intercepted #UD (see EMULTYPE_TRAP_UD). + * Used to test the full emulator from userspace. + * + * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware + * backdoor emulation, which is opt in via module param. + * VMware backdoor emulation handles select instructions + * and reinjects the #GP for all other cases. 
+ * + * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case + * the CR2/GPA value pass on the stack is valid. + * + * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility + * state and inject single-step #DBs after skipping + * an instruction (after completing userspace I/O). + * + * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that + * is attempting to write a gfn that contains one or + * more of the PTEs used to translate the write itself, + * and the owning page table is being shadowed by KVM. + * If emulation of the faulting instruction fails and + * this flag is set, KVM will exit to userspace instead + * of retrying emulation as KVM cannot make forward + * progress. + * + * If emulation fails for a write to guest page tables, + * KVM unprotects (zaps) the shadow page for the target + * gfn and resumes the guest to retry the non-emulatable + * instruction (on hardware). Unprotecting the gfn + * doesn't allow forward progress for a self-changing + * access because doing so also zaps the translation for + * the gfn, i.e. retrying the instruction will hit a + * !PRESENT fault, which results in a new shadow page + * and sends KVM back to square one. + * + * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip + * an instruction if it could generate a given software + * interrupt, which must be encoded via + * EMULTYPE_SET_SOFT_INT_VECTOR(). 
+ */ +#define EMULTYPE_NO_DECODE (1 << 0) +#define EMULTYPE_TRAP_UD (1 << 1) +#define EMULTYPE_SKIP (1 << 2) +#define EMULTYPE_ALLOW_RETRY_PF (1 << 3) +#define EMULTYPE_TRAP_UD_FORCED (1 << 4) +#define EMULTYPE_VMWARE_GP (1 << 5) +#define EMULTYPE_PF (1 << 6) +#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) +#define EMULTYPE_WRITE_PF_TO_SP (1 << 8) +#define EMULTYPE_SKIP_SOFT_INT (1 << 9) + +#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16) +#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff) + +static inline bool kvm_can_emulate_event_vectoring(int emul_type) +{ + return !(emul_type & EMULTYPE_PF); +} + +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len); +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, + u64 *data, u8 ndata); +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); + +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); -extern struct kvm_caps kvm_caps; -extern struct kvm_host_values kvm_host; +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); +int kvm_emulate_invd(struct kvm_vcpu *vcpu); +int kvm_emulate_mwait(struct kvm_vcpu *vcpu); +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); +int kvm_emulate_monitor(struct kvm_vcpu *vcpu); + +int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); + +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); + +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 
tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); + +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); + +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault, + bool from_hardware); +void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault, + bool from_hardware); + +static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + __kvm_inject_emulated_page_fault(vcpu, fault, false); +} + +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); + +static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) +{ + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); +} + +void kvm_inject_nmi(struct kvm_vcpu *vcpu); +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); + +void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size); +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); + +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); + +enum kvm_apicv_inhibit { + + /********************************************************************/ + /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ + /********************************************************************/ + + /* + * APIC acceleration is disabled by a module parameter + * and/or not supported in hardware. 
+ */ + APICV_INHIBIT_REASON_DISABLED, + + /* + * APIC acceleration is inhibited because AutoEOI feature is + * being used by a HyperV guest. + */ + APICV_INHIBIT_REASON_HYPERV, + + /* + * APIC acceleration is inhibited because the userspace didn't yet + * enable the kernel/split irqchip. + */ + APICV_INHIBIT_REASON_ABSENT, + + /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ + * (out of band, debug measure of blocking all interrupts on this vCPU) + * was enabled, to avoid AVIC/APICv bypassing it. + */ + APICV_INHIBIT_REASON_BLOCKIRQ, + + /* + * APICv is disabled because not all vCPUs have a 1:1 mapping between + * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, + + /* + * For simplicity, the APIC acceleration is inhibited + * first time either APIC ID or APIC base are changed by the guest + * from their reset values. + */ + APICV_INHIBIT_REASON_APIC_ID_MODIFIED, + APICV_INHIBIT_REASON_APIC_BASE_MODIFIED, + + /******************************************************/ + /* INHIBITs that are relevant only to the AMD's AVIC. */ + /******************************************************/ + + /* + * AVIC is inhibited on a vCPU because it runs a nested guest. + * + * This is needed because unlike APICv, the peers of this vCPU + * cannot use the doorbell mechanism to signal interrupts via AVIC when + * a vCPU runs nested. + */ + APICV_INHIBIT_REASON_NESTED, + + /* + * On SVM, the wait for the IRQ window is implemented with pending vIRQ, + * which cannot be injected when the AVIC is enabled, thus AVIC + * is inhibited while KVM waits for IRQ window. + */ + APICV_INHIBIT_REASON_IRQWIN, + + /* + * PIT (i8254) 're-inject' mode, relies on EOI intercept, + * which AVIC doesn't support for edge triggered interrupts. + */ + APICV_INHIBIT_REASON_PIT_REINJ, + + /* + * AVIC is disabled because SEV doesn't support it. 
+ */ + APICV_INHIBIT_REASON_SEV, + + /* + * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 + * mapping between logical ID and vCPU. + */ + APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + + /* + * AVIC is disabled because the vCPU's APIC ID is beyond the max + * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, + + NR_APICV_INHIBIT_REASONS, +}; + +#define __APICV_INHIBIT_REASON(reason) \ + { BIT(APICV_INHIBIT_REASON_##reason), #reason } + +#define APICV_INHIBIT_REASONS \ + __APICV_INHIBIT_REASON(DISABLED), \ + __APICV_INHIBIT_REASON(HYPERV), \ + __APICV_INHIBIT_REASON(ABSENT), \ + __APICV_INHIBIT_REASON(BLOCKIRQ), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ + __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ + __APICV_INHIBIT_REASON(NESTED), \ + __APICV_INHIBIT_REASON(IRQWIN), \ + __APICV_INHIBIT_REASON(PIT_REINJ), \ + __APICV_INHIBIT_REASON(SEV), \ + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) + +bool kvm_apicv_activated(struct kvm *kvm); +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); + +static inline void kvm_set_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, true); +} + +static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, false); +} + +void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc); + +static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm) +{ + kvm_inc_or_dec_irq_window_inhibit(kvm, true); +} + +static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm) 
+{ + kvm_inc_or_dec_irq_window_inhibit(kvm, false); +} + +void kvm_make_scan_ioapic_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap); void kvm_setup_xss_caps(void); @@ -461,22 +712,6 @@ extern bool enable_vmware_backdoor; extern int pi_inject_timer; -extern bool report_ignored_msrs; - -extern bool eager_page_split; - -static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); -} - -static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) -{ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); -} - static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, @@ -575,6 +810,8 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); + /* * Trigger machine check on the host. We assume all the MSRs are already set up * by the CPU and that we still run on the same CPU as the MCE occurred on. @@ -594,32 +831,10 @@ static inline void kvm_machine_check(void) #endif } -int kvm_spec_ctrl_test_value(u64 value); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); +void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); -bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); - -enum kvm_msr_access { - MSR_TYPE_R = BIT(0), - MSR_TYPE_W = BIT(1), - MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, -}; - -/* - * Internal error codes that are used to indicate that MSR emulation encountered - * an error that should result in #GP in the guest, unless userspace handles it. 
- * Note, '1', '0', and negative numbers are off limits, as they are used by KVM - * as part of KVM's lightly documented internal KVM_RUN return codes. - * - * UNSUPPORTED - The MSR isn't supported, either because it is completely - * unknown to KVM, or because the MSR should not exist according - * to the vCPU model. - * - * FILTERED - Access to the MSR is denied by a userspace MSR filter. - */ -#define KVM_MSR_RET_UNSUPPORTED 2 -#define KVM_MSR_RET_FILTERED 3 int kvm_sev_es_mmio(struct kvm_vcpu *vcpu, bool is_write, gpa_t gpa, unsigned int bytes, void *data); @@ -679,27 +894,4 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -#define CET_US_RESERVED_BITS GENMASK(9, 6) -#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) -#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) -#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) - -static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) -{ - if (data & CET_US_RESERVED_BITS) - return false; - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && - (data & CET_US_SHSTK_MASK_BITS)) - return false; - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && - (data & CET_US_IBT_MASK_BITS)) - return false; - if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) - return false; - /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. 
*/ - if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) - return false; - - return true; -} #endif diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index d28a057fa6c2..4ace12606e93 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -120,6 +120,7 @@ TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test TEST_GEN_PROGS_x86 += x86/svm_nested_vmcb12_gpa TEST_GEN_PROGS_x86 += x86/svm_nested_pat_test TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state +TEST_GEN_PROGS_x86 += x86/svm_pmu_host_guest_test TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync TEST_GEN_PROGS_x86 += x86/sync_regs_test TEST_GEN_PROGS_x86 += x86/ucna_injection_test diff --git a/tools/testing/selftests/kvm/include/x86/pmu.h b/tools/testing/selftests/kvm/include/x86/pmu.h index 98537cc8840d..608ed83d7c6a 100644 --- a/tools/testing/selftests/kvm/include/x86/pmu.h +++ b/tools/testing/selftests/kvm/include/x86/pmu.h @@ -38,6 +38,12 @@ #define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23) #define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24) +/* + * These are AMD-specific bits. + */ +#define AMD64_EVENTSEL_GUESTONLY BIT_ULL(40) +#define AMD64_EVENTSEL_HOSTONLY BIT_ULL(41) + /* RDPMC control flags, Intel only. */ #define INTEL_RDPMC_METRICS BIT_ULL(29) #define INTEL_RDPMC_FIXED BIT_ULL(30) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 513e4a1075fa..7d3a27bc0d84 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1226,6 +1226,8 @@ struct idt_entry { void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); +gva_t vm_alloc_stack(struct kvm_vm *vm, int nr_pages); + /* * Exception fixup morphs #DE to an arbitrary magic vector so that '0' can be * used to signal "no expcetion". 
@@ -1392,6 +1394,14 @@ static inline bool kvm_is_pmu_enabled(void) return get_kvm_param_bool("enable_pmu"); } +static inline bool kvm_is_mediated_pmu_enabled(void) +{ + if (host_cpu_is_intel) + return get_kvm_intel_param_bool("enable_mediated_pmu"); + + return get_kvm_amd_param_bool("enable_mediated_pmu"); +} + static inline bool kvm_is_forced_emulation_enabled(void) { return !!get_kvm_param_integer("force_emulation_prefix"); diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h index 6c013eb838be..c201c30485e7 100644 --- a/tools/testing/selftests/kvm/include/x86/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86/svm_util.h @@ -28,6 +28,9 @@ struct svm_test_data { void *msr_hva; u64 msr_gpa; + /* Stack */ + void *stack; /* gva */ + /* NPT */ u64 ncr3_gpa; }; @@ -57,7 +60,7 @@ static inline void vmmcall(void) ) struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, gva_t *p_svm_gva); -void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip); void run_guest(struct vmcb *vmcb, u64 vmcb_gpa); static inline bool kvm_cpu_has_npt(void) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index 90fffaf91595..4bcfd60e3aec 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -524,6 +524,8 @@ struct vmx_pages { u64 apic_access_gpa; void *apic_access; + void *stack; + u64 eptp_gpa; }; @@ -552,7 +554,7 @@ union vmx_ctrl_msr { struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, gva_t *p_vmx_gva); bool prepare_for_vmx_operation(struct vmx_pages *vmx); -void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip); bool load_vmcs(struct vmx_pages *vmx); bool ept_1g_pages_supported(void); diff --git 
a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 61cf952cd2dc..e19e8b5a09c5 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -30,21 +30,15 @@ __asm__( " ud2;" ); -#define L2_GUEST_STACK_SIZE 64 - static void l1_vmx_code(struct vmx_pages *vmx, u64 vcpu_id) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - unsigned long *rsp; - GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); GUEST_ASSERT(ept_1g_pages_supported()); - rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; - *rsp = vcpu_id; - prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); + *(u64 *)vmx->stack = vcpu_id; + prepare_vmcs(vmx, memstress_l2_guest_entry); GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL); @@ -53,13 +47,8 @@ static void l1_vmx_code(struct vmx_pages *vmx, u64 vcpu_id) static void l1_svm_code(struct svm_test_data *svm, u64 vcpu_id) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - unsigned long *rsp; - - - rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; - *rsp = vcpu_id; - generic_svm_setup(svm, memstress_l2_guest_entry, rsp); + *(u64 *)svm->stack = vcpu_id; + generic_svm_setup(svm, memstress_l2_guest_entry); run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index 4ca48de7a926..ef56dcefe011 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -778,6 +778,30 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) REPORT_GUEST_ASSERT(uc); } +gva_t vm_alloc_stack(struct kvm_vm *vm, int nr_pages) +{ + int size = nr_pages * getpagesize(); + gva_t stack_gva; + + stack_gva = __vm_alloc(vm, size, DEFAULT_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); + 
stack_gva += size; + + /* + * Align stack to match calling sequence requirements in section "The + * Stack Frame" of the System V ABI AMD64 Architecture Processor + * Supplement, which requires the value (%rsp + 8) to be a multiple of + * 16 when control is transferred to the function entry point. + * + * If this code is ever used to launch a vCPU with 32-bit entry point it + * may need to subtract 4 bytes instead of 8 bytes. + */ + TEST_ASSERT(IS_ALIGNED(stack_gva, PAGE_SIZE), + "__vm_alloc() did not provide a page-aligned address"); + stack_gva -= 8; + + return stack_gva; +} + void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus) { int r; @@ -820,27 +844,8 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id) { struct kvm_mp_state mp_state; struct kvm_regs regs; - gva_t stack_gva; struct kvm_vcpu *vcpu; - stack_gva = __vm_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); - - stack_gva += DEFAULT_STACK_PGS * getpagesize(); - - /* - * Align stack to match calling sequence requirements in section "The - * Stack Frame" of the System V ABI AMD64 Architecture Processor - * Supplement, which requires the value (%rsp + 8) to be a multiple of - * 16 when control is transferred to the function entry point. - * - * If this code is ever used to launch a vCPU with 32-bit entry point it - * may need to subtract 4 bytes instead of 8 bytes. 
- */ - TEST_ASSERT(IS_ALIGNED(stack_gva, PAGE_SIZE), - "__vm_alloc() did not provide a page-aligned address"); - stack_gva -= 8; - vcpu = __vm_vcpu_add(vm, vcpu_id); vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); vcpu_init_sregs(vm, vcpu); @@ -849,7 +854,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id) /* Setup guest general purpose registers */ vcpu_regs_get(vcpu, &regs); regs.rflags = regs.rflags | X86_EFLAGS_FIXED; - regs.rsp = stack_gva; + regs.rsp = vm_alloc_stack(vm, DEFAULT_STACK_PGS); vcpu_regs_set(vcpu, &regs); /* Setup the MP state */ diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index 3b01605ab016..1445b890986f 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -46,6 +46,8 @@ vcpu_alloc_svm(struct kvm_vm *vm, gva_t *p_svm_gva) svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); memset(svm->msr_hva, 0, getpagesize()); + svm->stack = (void *)vm_alloc_stack(vm, 1); + if (vm->stage2_mmu.pgd_created) svm->ncr3_gpa = vm->stage2_mmu.pgd; @@ -81,7 +83,7 @@ void vm_enable_npt(struct kvm_vm *vm) tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks); } -void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip) { struct vmcb *vmcb = svm->vmcb; u64 vmcb_gpa = svm->vmcb_gpa; @@ -122,7 +124,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r ctrl->msrpm_base_pa = svm->msr_gpa; vmcb->save.rip = (u64)guest_rip; - vmcb->save.rsp = (u64)guest_rsp; + vmcb->save.rsp = (u64)svm->stack; guest_regs.rdi = (u64)svm; if (svm->ncr3_gpa) { diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 7c10ba6e6fb4..cd09c9de4485 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -116,6 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, gva_t
*p_vmx_gva) vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); + vmx->stack = (void *)vm_alloc_stack(vm, 1); + if (vm->stage2_mmu.pgd_created) vmx->eptp_gpa = vm->stage2_mmu.pgd; @@ -366,11 +368,11 @@ static inline void init_vmcs_guest_state(void *rip, void *rsp) vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); } -void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) +void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip) { init_vmcs_control_fields(vmx); init_vmcs_host_state(); - init_vmcs_guest_state(guest_rip, guest_rsp); + init_vmcs_guest_state(guest_rip, vmx->stack); } bool kvm_cpu_has_ept(void) diff --git a/tools/testing/selftests/kvm/x86/aperfmperf_test.c b/tools/testing/selftests/kvm/x86/aperfmperf_test.c index c91660103137..845cb685f174 100644 --- a/tools/testing/selftests/kvm/x86/aperfmperf_test.c +++ b/tools/testing/selftests/kvm/x86/aperfmperf_test.c @@ -54,8 +54,6 @@ static void guest_read_aperf_mperf(void) GUEST_SYNC2(rdmsr(MSR_IA32_APERF), rdmsr(MSR_IA32_MPERF)); } -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code(void) { guest_read_aperf_mperf(); @@ -64,21 +62,18 @@ static void l2_guest_code(void) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; - generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); run_guest(vmcb, svm->vmcb_gpa); } static void l1_vmx_code(struct vmx_pages *vmx) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); GUEST_ASSERT_EQ(load_vmcs(vmx), true); - prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, NULL); /* * Enable MSR bitmaps (the bitmap itself is allocated, zeroed, and set diff --git a/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c 
b/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c index 5b3aef109cfc..77ce87c41a86 100644 --- a/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c +++ b/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c @@ -52,8 +52,6 @@ static void l2_guest_code(void) static void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; /* Set up Hyper-V enlightenments and eVMCS */ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); @@ -62,8 +60,7 @@ static void guest_code(struct vmx_pages *vmx_pages, GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_evmcs(hv_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); GUEST_ASSERT(!vmlaunch()); diff --git a/tools/testing/selftests/kvm/x86/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c index c7fa114aee20..1bda2cd3f739 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_evmcs.c +++ b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c @@ -78,9 +78,6 @@ void l2_guest_code(void) void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, gpa_t hv_hcall_page_gpa) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa); @@ -100,8 +97,7 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, GUEST_SYNC(4); GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); GUEST_SYNC(5); GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); diff --git a/tools/testing/selftests/kvm/x86/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c index 7a62f6a9d606..1f74b0fa9b83 100644 --- 
a/tools/testing/selftests/kvm/x86/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c @@ -18,8 +18,6 @@ #include "svm_util.h" #include "hyperv.h" -#define L2_GUEST_STACK_SIZE 256 - /* Exit to L1 from L2 with RDMSR instruction */ static inline void rdmsr_from_l2(u32 msr) { @@ -69,7 +67,6 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, struct hyperv_test_pages *hv_pages, gpa_t pgs_gpa) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; @@ -81,8 +78,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, GUEST_ASSERT(svm->vmcb_gpa); /* Prepare for L2 execution. */ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); /* L2 TLB flush setup */ hve->partition_assist_page = hv_pages->partition_assist_gpa; diff --git a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c index 52014a3210c8..25a182be00a9 100644 --- a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c +++ b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c @@ -26,8 +26,6 @@ static void guest_generate_buslocks(void) atomic_inc(val); } -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code(void) { guest_generate_buslocks(); @@ -36,21 +34,18 @@ static void l2_guest_code(void) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; - generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); run_guest(vmcb, svm->vmcb_gpa); } static void l1_vmx_code(struct vmx_pages *vmx) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); GUEST_ASSERT_EQ(load_vmcs(vmx), true); - prepare_vmcs(vmx, NULL, 
&l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, NULL); GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); GUEST_ASSERT(!vmlaunch()); diff --git a/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c index 761fec293408..b974cfb347d6 100644 --- a/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c +++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c @@ -21,8 +21,6 @@ enum { PORT_L0_EXIT = 0x2000, }; -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code(void) { /* Exit to L0 */ @@ -32,14 +30,11 @@ static void l2_guest_code(void) static void l1_vmx_code(struct vmx_pages *vmx_pages) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(0); @@ -47,11 +42,8 @@ static void l1_vmx_code(struct vmx_pages *vmx_pages) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - /* Prepare the VMCB for L2 execution. */ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT(0); diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c index 0e67cce83570..26b474bf1353 100644 --- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c @@ -40,8 +40,6 @@ #define TEST_HVA(vm, idx) addr_gpa2hva(vm, TEST_GPA(idx)) -#define L2_GUEST_STACK_SIZE 64 - /* Use the page offset bits to communicate the access+fault type. 
*/ #define TEST_SYNC_READ_FAULT BIT(0) #define TEST_SYNC_WRITE_FAULT BIT(1) @@ -92,7 +90,6 @@ static void l2_guest_code_tdp_disabled(void) void l1_vmx_code(struct vmx_pages *vmx) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); @@ -104,7 +101,7 @@ void l1_vmx_code(struct vmx_pages *vmx) else l2_rip = l2_guest_code_tdp_disabled; - prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, l2_rip); GUEST_SYNC(TEST_SYNC_NO_FAULT); GUEST_ASSERT(!vmlaunch()); @@ -115,7 +112,6 @@ void l1_vmx_code(struct vmx_pages *vmx) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; void *l2_rip; if (svm->ncr3_gpa) @@ -123,7 +119,7 @@ static void l1_svm_code(struct svm_test_data *svm) else l2_rip = l2_guest_code_tdp_disabled; - generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_rip); GUEST_SYNC(TEST_SYNC_NO_FAULT); run_guest(svm->vmcb, svm->vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86/nested_emulation_test.c b/tools/testing/selftests/kvm/x86/nested_emulation_test.c index fb7dcbe53ac7..e08c6b0697e5 100644 --- a/tools/testing/selftests/kvm/x86/nested_emulation_test.c +++ b/tools/testing/selftests/kvm/x86/nested_emulation_test.c @@ -57,7 +57,7 @@ static void guest_code(void *test_data) struct svm_test_data *svm = test_data; struct vmcb *vmcb = svm->vmcb; - generic_svm_setup(svm, NULL, NULL); + generic_svm_setup(svm, NULL); vmcb->save.idtr.limit = 0; vmcb->save.rip = (u64)l2_guest_code; @@ -69,7 +69,7 @@ static void guest_code(void *test_data) GUEST_ASSERT(prepare_for_vmx_operation(test_data)); GUEST_ASSERT(load_vmcs(test_data)); - prepare_vmcs(test_data, NULL, NULL); + prepare_vmcs(test_data, NULL); GUEST_ASSERT(!vmwrite(GUEST_IDTR_LIMIT, 0)); GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); GUEST_ASSERT(!vmwrite(EXCEPTION_BITMAP, 0)); diff --git 
a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c index 186e980aa8ee..aeec3121c8e8 100644 --- a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c @@ -5,8 +5,6 @@ #include "vmx.h" #include "svm_util.h" -#define L2_GUEST_STACK_SIZE 256 - /* * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with * the "real" exceptions used, #SS/#GP/#DF (12/13/8). @@ -91,9 +89,8 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, static void l1_svm_code(struct svm_test_data *svm) { struct vmcb_control_area *ctrl = &svm->vmcb->control; - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, NULL); svm->vmcb->save.idtr.limit = 0; ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); @@ -128,13 +125,11 @@ static void vmx_run_l2(void *l2_code, int vector, u32 error_code) static void l1_vmx_code(struct vmx_pages *vmx) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); GUEST_ASSERT_EQ(load_vmcs(vmx), true); - prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, NULL); GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); /* diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c index 11fd2467d823..8c2ba9674558 100644 --- a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -11,8 +11,6 @@ #include "kselftest.h" -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code(void) { vmcall(); @@ -20,11 +18,9 @@ static void l2_guest_code(void) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uintptr_t save_cr3; - generic_svm_setup(svm, 
l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); /* Try to run L2 with invalid CR3 and make sure it fails */ save_cr3 = svm->vmcb->save.cr3; @@ -42,14 +38,12 @@ static void l1_svm_code(struct svm_test_data *svm) static void l1_vmx_code(struct vmx_pages *vmx_pages) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uintptr_t save_cr3; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); /* Try to run L2 with invalid CR3 and make sure it fails */ save_cr3 = vmreadz(GUEST_CR3); diff --git a/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c index fa95568f55ff..2e04563790ff 100644 --- a/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c @@ -9,8 +9,6 @@ #include "svm_util.h" #include "vmx.h" -#define L2_GUEST_STACK_SIZE 64 - enum test_type { TEST_FINAL_PAGE_UNMAPPED, /* Final data page not present */ TEST_PT_PAGE_UNMAPPED, /* Page table page not present */ @@ -54,14 +52,13 @@ static void l2_guest_code_ins(void) static void l1_vmx_code(struct vmx_pages *vmx, u64 expected_fault_gpa, u64 test_type) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u64 exit_qual; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_entry, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, l2_entry); GUEST_ASSERT(!vmlaunch()); @@ -120,12 +117,10 @@ static void l1_vmx_code(struct vmx_pages *vmx, u64 expected_fault_gpa, static void l1_svm_code(struct svm_test_data *svm, u64 expected_fault_gpa, u64 test_type) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; u64 exit_info_1; - generic_svm_setup(svm, l2_entry, - 
&l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_entry); run_guest(vmcb, svm->vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c index f0e4adac4751..cb79d7b9619c 100644 --- a/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c @@ -34,8 +34,6 @@ #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) -#define L2_GUEST_STACK_SIZE 64 - enum { PORT_ABORT = 0x1000, PORT_REPORT, @@ -75,8 +73,6 @@ static void l2_guest_code(void) static void l1_guest_code(void *data) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - /* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); @@ -93,8 +89,7 @@ static void l1_guest_code(void *data) GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); @@ -105,8 +100,7 @@ static void l1_guest_code(void *data) } else { struct svm_test_data *svm = data; - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE; run_guest(svm->vmcb, svm->vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c index 190e93af20a1..18f765835bf4 100644 --- a/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c @@ -22,8 +22,6 @@ #define TSC_OFFSET_L2 ((u64)-33125236320908) #define TSC_MULTIPLIER_L2 
(L2_SCALE_FACTOR << 48) -#define L2_GUEST_STACK_SIZE 64 - enum { USLEEP, UCHECK_L1, UCHECK_L2 }; #define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) #define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) @@ -82,13 +80,10 @@ static void l2_guest_code(void) static void l1_svm_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - /* check that L1's frequency looks alright before launching L2 */ check_tsc_freq(UCHECK_L1); - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); /* enable TSC scaling for L2 */ wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32); @@ -105,7 +100,6 @@ static void l1_svm_code(struct svm_test_data *svm) static void l1_vmx_code(struct vmx_pages *vmx_pages) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u32 control; /* check that L1's frequency looks alright before launching L2 */ @@ -115,7 +109,7 @@ static void l1_vmx_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(load_vmcs(vmx_pages)); /* prepare the VMCS for L2 execution */ - prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); /* enable TSC offsetting and TSC scaling for L2 */ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c index 85d3f4cc76f3..a130759f39a1 100644 --- a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c +++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c @@ -28,8 +28,6 @@ #define TEST_VMCB_L2_GPA TEST_VMCB_L1_GPA(0) -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code_vmsave(void) { asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory"); @@ -70,10 +68,8 @@ static void l2_guest_code_vmcb1(void) static void l1_guest_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - /* Each test 
case initializes the guest RIP below */ - generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, NULL); /* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */ svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) | diff --git a/tools/testing/selftests/kvm/x86/smm_test.c b/tools/testing/selftests/kvm/x86/smm_test.c index 740051167dbd..e2542f4ced60 100644 --- a/tools/testing/selftests/kvm/x86/smm_test.c +++ b/tools/testing/selftests/kvm/x86/smm_test.c @@ -63,8 +63,6 @@ static void l2_guest_code(void) static void guest_code(void *arg) { - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u64 apicbase = rdmsr(MSR_IA32_APICBASE); struct svm_test_data *svm = arg; struct vmx_pages *vmx_pages = arg; @@ -81,13 +79,11 @@ static void guest_code(void *arg) if (arg) { if (this_cpu_has(X86_FEATURE_SVM)) { - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); } else { GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); } sync_with_host(5); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 409c6cc9f921..4a1056a6cb8d 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -19,8 +19,6 @@ #include "vmx.h" #include "svm_util.h" -#define L2_GUEST_STACK_SIZE 256 - void svm_l2_guest_code(void) { GUEST_SYNC(4); @@ -35,13 +33,11 @@ void svm_l2_guest_code(void) static void svm_l1_guest_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; GUEST_ASSERT(svm->vmcb_gpa); /* Prepare for L2 execution. 
*/ - generic_svm_setup(svm, svm_l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, svm_l2_guest_code); vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK); @@ -78,8 +74,6 @@ void vmx_l2_guest_code(void) static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_SYNC(3); @@ -89,8 +83,7 @@ static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(4); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - prepare_vmcs(vmx_pages, vmx_l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, vmx_l2_guest_code); GUEST_SYNC(5); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); diff --git a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c index d3cc5e4f7883..7b1f4a4818bd 100644 --- a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c @@ -54,15 +54,12 @@ static void l2_guest_code(struct svm_test_data *svm) static void l1_guest_code(struct svm_test_data *svm) { - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; x2apic_enable(); /* Prepare for L2 execution. 
*/ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); /* No virtual interrupt masking */ vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; diff --git a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c index 7fbfaa054c95..77c6ce9f4507 100644 --- a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c +++ b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c @@ -9,8 +9,6 @@ #include "svm_util.h" -#define L2_GUEST_STACK_SIZE 64 - #define DO_BRANCH() do { asm volatile("jmp 1f\n 1: nop"); } while (0) struct lbr_branch { @@ -55,7 +53,6 @@ static void l2_guest_code(struct svm_test_data *svm) static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; struct lbr_branch l1_branch; @@ -65,8 +62,7 @@ static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv) CHECK_BRANCH_MSRS(&l1_branch); /* Run L2, which will also do the same */ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); if (nested_lbrv) vmcb->control.misc_ctl2 = SVM_MISC2_ENABLE_V_LBR; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c index 6a89eaffc657..6bc301207cbc 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c @@ -8,8 +8,6 @@ #include "kselftest.h" -#define L2_GUEST_STACK_SIZE 64 - static void l2_guest_code(void) { unsigned long efer = rdmsr(MSR_EFER); @@ -24,10 +22,7 @@ static void l2_guest_code(void) static void l1_guest_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, 
l2_guest_code); run_guest(svm->vmcb, svm->vmcb_gpa); /* Unreachable, L1 should be shutdown */ diff --git a/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c index 92da8ff34da1..14ec9d6ad195 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c @@ -25,8 +25,6 @@ #include "processor.h" #include "svm_util.h" -#define L2_GUEST_STACK_SIZE 256 - #define PAT_DEFAULT 0x0007040600070406ULL #define L1_PAT_VALUE 0x0007040600070404ULL /* Change PA0 to WT */ #define L2_VMCB12_PAT 0x0606060606060606ULL /* All WB */ @@ -59,14 +57,13 @@ static void l2_guest_code(void) static void l1_guest_code(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; int i; wrmsr(MSR_IA32_CR_PAT, L1_PAT_VALUE); GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L1_PAT_VALUE); - generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); vmcb->save.g_pat = L2_VMCB12_PAT; vmcb->control.intercept &= ~(1ULL << INTERCEPT_MSR_PROT); @@ -94,11 +91,10 @@ static void l1_guest_code(struct svm_test_data *svm) static void l1_guest_code_invalid_gpat(struct svm_test_data *svm) { - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; /* VMRUN should fail without running L2 */ - generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, NULL); vmcb->save.g_pat = INVALID_PAT_VALUE; run_guest(vmcb, svm->vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c index c6ea3d609a62..2a4a216954bb 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c @@ -19,12 +19,9 @@ static void l2_guest_code(struct svm_test_data *svm) static void l1_guest_code(struct 
svm_test_data *svm, struct idt_entry *idt) { - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index f72f11d4c4f8..0b640d09d194 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -78,17 +78,13 @@ static void l2_guest_code_nmi(void) static void l1_guest_code(struct svm_test_data *svm, u64 is_nmi, u64 idt_alt) { - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; if (is_nmi) x2apic_enable(); /* Prepare for L2 execution. */ - generic_svm_setup(svm, - is_nmi ? l2_guest_code_nmi : l2_guest_code_int, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, is_nmi ? 
l2_guest_code_nmi : l2_guest_code_int); vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR); vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT); diff --git a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c index a4935ce2fb99..b3f45035745f 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c @@ -9,14 +9,9 @@ #include "kvm_test_harness.h" #include "test_util.h" - -#define L2_GUEST_STACK_SIZE 64 - #define SYNC_GP 101 #define SYNC_L2_STARTED 102 -static unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - static void guest_gp_handler(struct ex_regs *regs) { GUEST_SYNC(SYNC_GP); @@ -30,28 +25,28 @@ static void l2_code(void) static void l1_vmrun(struct svm_test_data *svm, gpa_t gpa) { - generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_code); asm volatile ("vmrun %[gpa]" : : [gpa] "a" (gpa) : "memory"); } static void l1_vmload(struct svm_test_data *svm, gpa_t gpa) { - generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_code); asm volatile ("vmload %[gpa]" : : [gpa] "a" (gpa) : "memory"); } static void l1_vmsave(struct svm_test_data *svm, gpa_t gpa) { - generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_code); asm volatile ("vmsave %[gpa]" : : [gpa] "a" (gpa) : "memory"); } static void l1_vmexit(struct svm_test_data *svm, gpa_t gpa) { - generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_code); run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); diff --git a/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c b/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c new file mode 100644 index 000000000000..c5b5cd788d93 --- /dev/null +++ 
b/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nested SVM PMU Host-Only/Guest-Only test + * + * Copyright (C) 2026, Google LLC. + * + * Test that KVM correctly virtualizes the AMD PMU Host-Only (bit 41) and + * Guest-Only (bit 40) event selector bits across all SVM state + * transitions. + * + * Programs 4 PMCs simultaneously with all combinations of Host-Only and + * Guest-Only bits, then verifies correct counting behavior with different + * combinations of EFER.SVME and host/guest mode -- as well as event filtering. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "pmu.h" + +#define EVENTSEL_RETIRED_INSNS (ARCH_PERFMON_EVENTSEL_OS | \ + ARCH_PERFMON_EVENTSEL_USR | \ + ARCH_PERFMON_EVENTSEL_ENABLE | \ + AMD_ZEN_INSTRUCTIONS_RETIRED) + +/* PMC configurations: index corresponds to Host-Only | Guest-Only bits */ +#define PMC_NONE 0 /* Neither bit set */ +#define PMC_G 1 /* Guest-Only bit set */ +#define PMC_H 2 /* Host-Only bit set */ +#define PMC_HG 3 /* Both bits set */ +#define NR_PMCS 4 + +#define LOOP_INSNS 1000 + +static __always_inline void run_instruction_loop(void) +{ + unsigned int i; + + for (i = 0; i < LOOP_INSNS; i++) + __asm__ __volatile__("nop"); +} + +static __always_inline void read_counters(uint64_t *counts) +{ + int i; + + for (i = 0; i < NR_PMCS; i++) + counts[i] = rdmsr(MSR_F15H_PERF_CTR + 2 * i); +} + +static __always_inline void run_and_measure(uint64_t *deltas) +{ + uint64_t before[NR_PMCS], after[NR_PMCS]; + int i; + + read_counters(before); + run_instruction_loop(); + read_counters(after); + + for (i = 0; i < NR_PMCS; i++) + deltas[i] = after[i] - before[i]; +} + +static void assert_pmc_counts(uint64_t *deltas, unsigned int expected_counting) +{ + int i; + + for (i = 0; i < NR_PMCS; i++) { + if (expected_counting & 
BIT(i)) + GUEST_ASSERT_NE(deltas[i], 0); + else + GUEST_ASSERT_EQ(deltas[i], 0); + } +} + +static uint64_t l2_deltas[NR_PMCS]; + +static void l2_guest_code(void) +{ + run_and_measure(l2_deltas); + vmmcall(); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + struct vmcb *vmcb = svm->vmcb; + uint64_t deltas[NR_PMCS]; + uint64_t eventsel; + int i; + + /* Program 4 PMCs with all combinations of Host-Only/Guest-Only bits */ + for (i = 0; i < NR_PMCS; i++) { + eventsel = EVENTSEL_RETIRED_INSNS; + if (i & PMC_G) + eventsel |= AMD64_EVENTSEL_GUESTONLY; + if (i & PMC_H) + eventsel |= AMD64_EVENTSEL_HOSTONLY; + wrmsr(MSR_F15H_PERF_CTL + 2 * i, eventsel); + wrmsr(MSR_F15H_PERF_CTR + 2 * i, 0); + } + + /* Step 1: SVME=0 - Only the counter with neither bits set counts */ + wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVME); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE)); + + /* Step 2: Set SVME=1 - In L1 "host mode"; Guest-Only stops */ + wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG)); + + /* Step 3: VMRUN to L2 - In "guest mode"; Host-Only stops */ + generic_svm_setup(svm, l2_guest_code); + vmcb->control.intercept &= ~(1ULL << INTERCEPT_MSR_PROT); + + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT_EQ(vmcb->control.exit_code, SVM_EXIT_VMMCALL); + assert_pmc_counts(l2_deltas, BIT(PMC_NONE) | BIT(PMC_G) | BIT(PMC_HG)); + + /* Step 4: After VMEXIT to L1 - Back in "host mode"; Guest-Only stops */ + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG)); + + /* Step 5: Set KVM_PMU_EVENT_DENY - all counters stop */ + GUEST_SYNC(KVM_PMU_EVENT_DENY); + run_and_measure(deltas); + assert_pmc_counts(deltas, 0); + + /* Step 6: Set KVM_PMU_EVENT_ALLOW - back to all except Guest-only */ + GUEST_SYNC(KVM_PMU_EVENT_ALLOW); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG)); + + /* 
Step 7: Clear Host-Only for PMC_HG - counter stops in "host mode" */ + eventsel = rdmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG); + wrmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG, eventsel & ~AMD64_EVENTSEL_HOSTONLY); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H)); + + /* Step 8: Restore Host-Only for PMC_HG - counter counts again */ + wrmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG, eventsel); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG)); + + /* Step 9: Clear SVME - Only the counter with neither bits set counts */ + wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVME); + run_and_measure(deltas); + assert_pmc_counts(deltas, BIT(PMC_NONE)); + + GUEST_DONE(); +} + +static struct kvm_pmu_event_filter *alloc_event_filter(u64 event) +{ + struct kvm_pmu_event_filter *filter; + + filter = malloc(sizeof(*filter) + sizeof(event)); + TEST_ASSERT(filter != NULL, "Filter allocation failed"); + + memset(filter, 0, sizeof(*filter)); + memcpy(filter->events, &event, sizeof(event)); + filter->nevents = 1; + filter->action = KVM_PMU_EVENT_ALLOW; + + return filter; +} + +int main(int argc, char *argv[]) +{ + struct kvm_pmu_event_filter *filter; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + gva_t svm_gva; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_is_pmu_enabled()); + TEST_REQUIRE(kvm_is_mediated_pmu_enabled()); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + + filter = alloc_event_filter(AMD_ZEN_INSTRUCTIONS_RETIRED); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto done; + case UCALL_DONE: + goto done; + case UCALL_SYNC: + filter->action = uc.args[1]; + vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, filter); + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + goto done; + } + } +done: 
+ kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c index b1887242f3b8..7c57fb7e6422 100644 --- a/tools/testing/selftests/kvm/x86/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c @@ -19,13 +19,10 @@ static void l2_guest_code(struct svm_test_data *svm) static void l1_guest_code(struct svm_test_data *svm) { - #define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; struct vmcb *vmcb = svm->vmcb; /* Prepare for L2 execution. */ - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); run_guest(vmcb, svm->vmcb_gpa); diff --git a/tools/testing/selftests/kvm/x86/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c index f1c488e0d497..0d83516f4bd0 100644 --- a/tools/testing/selftests/kvm/x86/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c @@ -21,9 +21,6 @@ static void l2_guest_code(void) : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); } -#define L2_GUEST_STACK_SIZE 64 -unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - void l1_guest_code_vmx(struct vmx_pages *vmx) { @@ -31,8 +28,7 @@ void l1_guest_code_vmx(struct vmx_pages *vmx) GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx, l2_guest_code); GUEST_ASSERT(!vmlaunch()); /* L2 should triple fault after a triple fault event injected. 
*/ @@ -44,8 +40,7 @@ void l1_guest_code_svm(struct svm_test_data *svm) { struct vmcb *vmcb = svm->vmcb; - generic_svm_setup(svm, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + generic_svm_setup(svm, l2_guest_code); /* don't intercept shutdown to test the case of SVM allowing to do so */ vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); diff --git a/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c index 1720113eae79..463f73aa9159 100644 --- a/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c @@ -36,16 +36,13 @@ static void l2_guest_code(void) static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u32 control; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c index 80a4fd1e5bbb..f9b88a6f6113 100644 --- a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c @@ -31,15 +31,13 @@ static void l2_guest_code(void) static void l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u32 control; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); /* Prepare the VMCS for L2 execution. 
*/ - prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); control |= CPU_BASED_USE_MSR_BITMAPS; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); diff --git a/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c index a2eaceed9ad5..6d88c54f69fa 100644 --- a/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c +++ b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c @@ -25,15 +25,11 @@ static void l2_guest_code(void) static void l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); /* * L2 must be run without unrestricted guest, verify that the selftests diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c index f13dee317383..75073efa926d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -27,8 +27,6 @@ static void l2_guest_code(void) static void l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u64 guest_cr4; gpa_t pml5_pa, pml4_pa; u64 *pml5; @@ -42,8 +40,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); GUEST_ASSERT(load_vmcs(vmx_pages)); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); /* * Set up L2 with a 4-level page table by pointing its 
CR3 to diff --git a/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c index 1b7b6ba23de7..eb8021c33cd4 100644 --- a/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c @@ -66,8 +66,6 @@ void l2_guest_code(void) void l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; u64 l1_vmx_pt_start; u64 l1_vmx_pt_finish; u64 l1_tsc_deadline, l2_tsc_deadline; @@ -77,8 +75,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(load_vmcs(vmx_pages)); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + prepare_vmcs(vmx_pages, l2_guest_code); /* * Check for Preemption timer support |
