summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/kvm_host.h655
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c1
-rw-r--r--arch/x86/kvm/fpu.h26
-rw-r--r--arch/x86/kvm/hyperv.c7
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/ioapic.c1
-rw-r--r--arch/x86/kvm/ioapic.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.h121
-rw-r--r--arch/x86/kvm/mmu/mmu.c541
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h66
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h88
-rw-r--r--arch/x86/kvm/mmu/spte.c4
-rw-r--r--arch/x86/kvm/mmu/spte.h69
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c3
-rw-r--r--arch/x86/kvm/msrs.c2745
-rw-r--r--arch/x86/kvm/msrs.h156
-rw-r--r--arch/x86/kvm/mtrr.c2
-rw-r--r--arch/x86/kvm/pmu.c12
-rw-r--r--arch/x86/kvm/regs.c874
-rw-r--r--arch/x86/kvm/regs.h60
-rw-r--r--arch/x86/kvm/svm/nested.c14
-rw-r--r--arch/x86/kvm/svm/svm.c6
-rw-r--r--arch/x86/kvm/tss.h7
-rw-r--r--arch/x86/kvm/vmx/nested.c16
-rw-r--r--arch/x86/kvm/vmx/vmx.c26
-rw-r--r--arch/x86/kvm/x86.c3762
-rw-r--r--arch/x86/kvm/x86.h462
-rw-r--r--tools/testing/selftests/kvm/Makefile.kvm1
-rw-r--r--tools/testing/selftests/kvm/include/x86/pmu.h6
-rw-r--r--tools/testing/selftests/kvm/include/x86/processor.h10
-rw-r--r--tools/testing/selftests/kvm/include/x86/svm_util.h5
-rw-r--r--tools/testing/selftests/kvm/include/x86/vmx.h4
-rw-r--r--tools/testing/selftests/kvm/lib/x86/memstress.c19
-rw-r--r--tools/testing/selftests/kvm/lib/x86/processor.c45
-rw-r--r--tools/testing/selftests/kvm/lib/x86/svm.c6
-rw-r--r--tools/testing/selftests/kvm/lib/x86/vmx.c6
-rw-r--r--tools/testing/selftests/kvm/x86/aperfmperf_test.c9
-rw-r--r--tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/hyperv_evmcs.c6
-rw-r--r--tools/testing/selftests/kvm/x86/hyperv_svm_test.c6
-rw-r--r--tools/testing/selftests/kvm/x86/kvm_buslock_test.c9
-rw-r--r--tools/testing/selftests/kvm/x86/nested_close_kvm_test.c12
-rw-r--r--tools/testing/selftests/kvm/x86/nested_dirty_log_test.c8
-rw-r--r--tools/testing/selftests/kvm/x86/nested_emulation_test.c4
-rw-r--r--tools/testing/selftests/kvm/x86/nested_exceptions_test.c9
-rw-r--r--tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c10
-rw-r--r--tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c9
-rw-r--r--tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c10
-rw-r--r--tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c10
-rw-r--r--tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c6
-rw-r--r--tools/testing/selftests/kvm/x86/smm_test.c8
-rw-r--r--tools/testing/selftests/kvm/x86/state_test.c11
-rw-r--r--tools/testing/selftests/kvm/x86/svm_int_ctl_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c6
-rw-r--r--tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c7
-rw-r--r--tools/testing/selftests/kvm/x86/svm_nested_pat_test.c8
-rw-r--r--tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c6
-rw-r--r--tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c13
-rw-r--r--tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c215
-rw-r--r--tools/testing/selftests/kvm/x86/svm_vmcall_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/triple_fault_event_test.c9
-rw-r--r--tools/testing/selftests/kvm/x86/vmx_apic_access_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c4
-rw-r--r--tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c6
-rw-r--r--tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c5
71 files changed, 5273 insertions, 5029 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5f6c1ce9673b..b517257a6315 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -78,12 +78,6 @@
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
-#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
- KVM_BUS_LOCK_DETECTION_EXIT)
-
-#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \
- KVM_X86_NOTIFY_VMEXIT_USER)
-
/* x86-specific vcpu->requests bit members */
#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
@@ -161,12 +155,6 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
-#define KVM_MIN_ALLOC_MMU_PAGES 64UL
-#define KVM_MMU_HASH_SHIFT 12
-#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
#define KVM_NR_VAR_MTRR 8
@@ -315,6 +303,53 @@ enum x86_intercept_stage;
struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
+struct kvm_x86_msr_filter;
+struct kvm_x86_pmu_event_filter;
+
+struct kvm_caps {
+ /* control of guest tsc rate supported? */
+ bool has_tsc_control;
+ /* maximum supported tsc_khz for guests */
+ u32 max_guest_tsc_khz;
+ /* number of bits of the fractional part of the TSC scaling ratio */
+ u8 tsc_scaling_ratio_frac_bits;
+ /* maximum allowed value of TSC scaling ratio */
+ u64 max_tsc_scaling_ratio;
+ /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
+ u64 default_tsc_scaling_ratio;
+ /* bus lock detection supported? */
+ bool has_bus_lock_exit;
+ /* notify VM exit supported? */
+ bool has_notify_vmexit;
+ /* bit mask of VM types */
+ u32 supported_vm_types;
+
+ u64 supported_mce_cap;
+ u64 supported_xcr0;
+ u64 supported_xss;
+ u64 supported_perf_cap;
+
+ u64 supported_quirks;
+ u64 inapplicable_quirks;
+};
+extern struct kvm_caps kvm_caps;
+
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 s_cet;
+ u64 arch_capabilities;
+};
+extern struct kvm_host_values kvm_host;
+
/*
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
@@ -452,9 +487,24 @@ struct kvm_pio_request {
#define PT64_ROOT_MAX_LEVEL 5
-struct rsvd_bits_validate {
+struct kvm_page_format {
u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
u64 bad_mt_xwr;
+
+ /*
+ * The pkru_mask indicates if protection key checks are needed. It
+ * consists of 16 domains indexed by page fault error code bits [4:1],
+ * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+ * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+ */
+ u32 pkru_mask;
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Array index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u16 permissions[16];
};
struct kvm_mmu_root_info {
@@ -478,43 +528,35 @@ struct kvm_page_fault;
/*
* x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
- * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
+ * and 2-level 32-bit). The kvm_pagewalk structure abstracts the details of the
* current mmu mode.
*/
-struct kvm_mmu {
+struct kvm_pagewalk {
unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
- int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault,
bool from_hardware);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
gpa_t gva_or_gpa, u64 access,
struct x86_exception *exception);
+
+ union kvm_cpu_role cpu_role;
+ struct kvm_page_format fmt;
+};
+
+struct kvm_mmu {
+ int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
int (*sync_spte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, int i);
+ struct kvm_pagewalk *w;
+
struct kvm_mmu_root_info root;
hpa_t mirror_root_hpa;
- union kvm_cpu_role cpu_role;
union kvm_mmu_page_role root_role;
- /*
- * The pkru_mask indicates if protection key checks are needed. It
- * consists of 16 domains indexed by page fault error code bits [4:1],
- * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
- * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
- */
- u32 pkru_mask;
-
struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
- /*
- * Bitmap; bit set = permission fault
- * Byte index: page fault error code [4:1]
- * Bit index: pte permissions in ACC_* format
- */
- u16 permissions[16];
-
u64 *pae_root;
u64 *pml4_root;
u64 *pml5_root;
@@ -524,8 +566,7 @@ struct kvm_mmu {
* bits include not only hardware reserved bits but also
* the bits spte never used.
*/
- struct rsvd_bits_validate shadow_zero_check;
- struct rsvd_bits_validate guest_rsvd_check;
+ struct kvm_page_format fmt;
};
enum pmc_type {
@@ -866,24 +907,14 @@ struct kvm_vcpu_arch {
/* Non-nested MMU for L1 */
struct kvm_mmu root_mmu;
- /* L1 MMU when running nested */
+ /* L1 TDP when running nested */
struct kvm_mmu guest_mmu;
+ struct kvm_pagewalk ngpa_walk;
/*
- * Paging state of an L2 guest (used for nested npt)
- *
- * This context will save all necessary information to walk page tables
- * of an L2 guest. This context is only initialized for page table
- * walking and not for faulting since we never handle l2 page faults on
- * the host.
- */
- struct kvm_mmu nested_mmu;
-
- /*
- * Pointer to the mmu context currently used for
- * gva_to_gpa translations.
+ * Pagewalk context used for gva_to_gpa translations.
*/
- struct kvm_mmu *walk_mmu;
+ struct kvm_pagewalk gva_walk;
u64 pdptrs[4]; /* pae */
@@ -1246,13 +1277,6 @@ struct kvm_hv {
};
#endif
-struct msr_bitmap_range {
- u32 flags;
- u32 nmsrs;
- u32 base;
- unsigned long *bitmap;
-};
-
#ifdef CONFIG_KVM_XEN
/* Xen emulation context */
struct kvm_xen {
@@ -1283,132 +1307,6 @@ enum kvm_suppress_eoi_broadcast_mode {
KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */
};
-struct kvm_x86_msr_filter {
- u8 count;
- bool default_allow:1;
- struct msr_bitmap_range ranges[16];
-};
-
-struct kvm_x86_pmu_event_filter {
- __u32 action;
- __u32 nevents;
- __u32 fixed_counter_bitmap;
- __u32 flags;
- __u32 nr_includes;
- __u32 nr_excludes;
- __u64 *includes;
- __u64 *excludes;
- __u64 events[] __counted_by(nevents);
-};
-
-enum kvm_apicv_inhibit {
-
- /********************************************************************/
- /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
- /********************************************************************/
-
- /*
- * APIC acceleration is disabled by a module parameter
- * and/or not supported in hardware.
- */
- APICV_INHIBIT_REASON_DISABLED,
-
- /*
- * APIC acceleration is inhibited because AutoEOI feature is
- * being used by a HyperV guest.
- */
- APICV_INHIBIT_REASON_HYPERV,
-
- /*
- * APIC acceleration is inhibited because the userspace didn't yet
- * enable the kernel/split irqchip.
- */
- APICV_INHIBIT_REASON_ABSENT,
-
- /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
- * (out of band, debug measure of blocking all interrupts on this vCPU)
- * was enabled, to avoid AVIC/APICv bypassing it.
- */
- APICV_INHIBIT_REASON_BLOCKIRQ,
-
- /*
- * APICv is disabled because not all vCPUs have a 1:1 mapping between
- * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
- */
- APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
-
- /*
- * For simplicity, the APIC acceleration is inhibited
- * first time either APIC ID or APIC base are changed by the guest
- * from their reset values.
- */
- APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
- APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
-
- /******************************************************/
- /* INHIBITs that are relevant only to the AMD's AVIC. */
- /******************************************************/
-
- /*
- * AVIC is inhibited on a vCPU because it runs a nested guest.
- *
- * This is needed because unlike APICv, the peers of this vCPU
- * cannot use the doorbell mechanism to signal interrupts via AVIC when
- * a vCPU runs nested.
- */
- APICV_INHIBIT_REASON_NESTED,
-
- /*
- * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
- * which cannot be injected when the AVIC is enabled, thus AVIC
- * is inhibited while KVM waits for IRQ window.
- */
- APICV_INHIBIT_REASON_IRQWIN,
-
- /*
- * PIT (i8254) 're-inject' mode, relies on EOI intercept,
- * which AVIC doesn't support for edge triggered interrupts.
- */
- APICV_INHIBIT_REASON_PIT_REINJ,
-
- /*
- * AVIC is disabled because SEV doesn't support it.
- */
- APICV_INHIBIT_REASON_SEV,
-
- /*
- * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
- * mapping between logical ID and vCPU.
- */
- APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
-
- /*
- * AVIC is disabled because the vCPU's APIC ID is beyond the max
- * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
- */
- APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
-
- NR_APICV_INHIBIT_REASONS,
-};
-
-#define __APICV_INHIBIT_REASON(reason) \
- { BIT(APICV_INHIBIT_REASON_##reason), #reason }
-
-#define APICV_INHIBIT_REASONS \
- __APICV_INHIBIT_REASON(DISABLED), \
- __APICV_INHIBIT_REASON(HYPERV), \
- __APICV_INHIBIT_REASON(ABSENT), \
- __APICV_INHIBIT_REASON(BLOCKIRQ), \
- __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
- __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
- __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
- __APICV_INHIBIT_REASON(NESTED), \
- __APICV_INHIBIT_REASON(IRQWIN), \
- __APICV_INHIBIT_REASON(PIT_REINJ), \
- __APICV_INHIBIT_REASON(SEV), \
- __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
- __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
-
struct kvm_possible_nx_huge_pages {
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1771,11 +1669,6 @@ struct kvm_lapic_irq {
bool msi_redir_hint;
};
-static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
-{
- return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
-}
-
enum kvm_x86_run_flags {
KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
@@ -2054,7 +1947,6 @@ struct kvm_arch_async_pf {
u64 error_code;
};
-extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern bool __read_mostly enable_ipiv;
@@ -2069,9 +1961,6 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
-void kvm_x86_vendor_exit(void);
-
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -2114,304 +2003,12 @@ enum kvm_intr_type {
((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
(!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
-void __init kvm_mmu_x86_module_init(void);
-int kvm_mmu_vendor_module_init(void);
-void kvm_mmu_vendor_module_exit(void);
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
-int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_init_vm(struct kvm *kvm);
-void kvm_mmu_uninit_vm(struct kvm *kvm);
-
-void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
- struct kvm_memory_slot *slot);
-
-void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
-void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- const struct kvm_memory_slot *memslot,
- int start_level);
-void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
- const struct kvm_memory_slot *memslot,
- int target_level);
-void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
- const struct kvm_memory_slot *memslot,
- u64 start, u64 end,
- int target_level);
-void kvm_mmu_recover_huge_pages(struct kvm *kvm,
- const struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- const struct kvm_memory_slot *memslot);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
-
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
-
-extern bool tdp_enabled;
-
-/*
- * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
- * userspace I/O) to indicate that the emulation context
- * should be reused as is, i.e. skip initialization of
- * emulation context, instruction fetch and decode.
- *
- * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
- * Indicates that only select instructions (tagged with
- * EmulateOnUD) should be emulated (to minimize the emulator
- * attack surface). See also EMULTYPE_TRAP_UD_FORCED.
- *
- * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
- * decode the instruction length. For use *only* by
- * kvm_x86_ops.skip_emulated_instruction() implementations if
- * EMULTYPE_COMPLETE_USER_EXIT is not set.
- *
- * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
- * retry native execution under certain conditions,
- * Can only be set in conjunction with EMULTYPE_PF.
- *
- * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
- * triggered by KVM's magic "force emulation" prefix,
- * which is opt in via module param (off by default).
- * Bypasses EmulateOnUD restriction despite emulating
- * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
- * Used to test the full emulator from userspace.
- *
- * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
- * backdoor emulation, which is opt in via module param.
- * VMware backdoor emulation handles select instructions
- * and reinjects the #GP for all other cases.
- *
- * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case
- * the CR2/GPA value pass on the stack is valid.
- *
- * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
- * state and inject single-step #DBs after skipping
- * an instruction (after completing userspace I/O).
- *
- * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
- * is attempting to write a gfn that contains one or
- * more of the PTEs used to translate the write itself,
- * and the owning page table is being shadowed by KVM.
- * If emulation of the faulting instruction fails and
- * this flag is set, KVM will exit to userspace instead
- * of retrying emulation as KVM cannot make forward
- * progress.
- *
- * If emulation fails for a write to guest page tables,
- * KVM unprotects (zaps) the shadow page for the target
- * gfn and resumes the guest to retry the non-emulatable
- * instruction (on hardware). Unprotecting the gfn
- * doesn't allow forward progress for a self-changing
- * access because doing so also zaps the translation for
- * the gfn, i.e. retrying the instruction will hit a
- * !PRESENT fault, which results in a new shadow page
- * and sends KVM back to square one.
- *
- * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
- * an instruction if it could generate a given software
- * interrupt, which must be encoded via
- * EMULTYPE_SET_SOFT_INT_VECTOR().
- */
-#define EMULTYPE_NO_DECODE (1 << 0)
-#define EMULTYPE_TRAP_UD (1 << 1)
-#define EMULTYPE_SKIP (1 << 2)
-#define EMULTYPE_ALLOW_RETRY_PF (1 << 3)
-#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
-#define EMULTYPE_VMWARE_GP (1 << 5)
-#define EMULTYPE_PF (1 << 6)
-#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
-#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
-#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
-
-#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
-#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
-
-static inline bool kvm_can_emulate_event_vectoring(int emul_type)
-{
- return !(emul_type & EMULTYPE_PF);
-}
-
-int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
-int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
- void *insn, int insn_len);
-void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
- u64 *data, u8 ndata);
-void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
-
-void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa);
-void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason);
-
-void kvm_enable_efer_bits(u64);
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
-int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
-int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
-int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
-int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
-int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
-int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
-int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
-int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
-int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
-int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
-int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
-int kvm_emulate_invd(struct kvm_vcpu *vcpu);
-int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
-int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
-int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
-
-int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
-int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
-int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
-
-void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
- int reason, bool has_error_code, u32 error_code);
-
-void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
-void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
-unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
-int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
-
-unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
-
-void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
- bool has_error_code, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault,
- bool from_hardware);
-void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault,
- bool from_hardware);
-
-static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- __kvm_inject_emulated_page_fault(vcpu, fault, false);
-}
-
-bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
-
-static inline int __kvm_irq_line_state(unsigned long *irq_state,
- int irq_source_id, int level)
-{
- /* Logical OR for level trig interrupt */
- if (level)
- __set_bit(irq_source_id, irq_state);
- else
- __clear_bit(irq_source_id, irq_state);
-
- return !!(*irq_state);
-}
-
-void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
-
-void kvm_update_dr7(struct kvm_vcpu *vcpu);
-
-bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- bool always_retry);
-
-static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
- gpa_t cr2_or_gpa)
-{
- return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
-}
-
-void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
- ulong roots_to_free);
-void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
-
-bool kvm_apicv_activated(struct kvm *kvm);
-bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
-void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
-void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
- enum kvm_apicv_inhibit reason, bool set);
-void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
- enum kvm_apicv_inhibit reason, bool set);
-
-static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
- enum kvm_apicv_inhibit reason)
-{
- kvm_set_or_clear_apicv_inhibit(kvm, reason, true);
-}
-
-static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
- enum kvm_apicv_inhibit reason)
-{
- kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
-}
-
-void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc);
-
-static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm)
-{
- kvm_inc_or_dec_irq_window_inhibit(kvm, true);
-}
-
-static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm)
-{
- kvm_inc_or_dec_irq_window_inhibit(kvm, false);
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
- void *insn, int insn_len);
-void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- u64 addr, unsigned long roots);
-void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
-
-void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
- int tdp_max_root_level, int tdp_huge_page_level);
-
-
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#endif
#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
-static inline u16 kvm_read_ldt(void)
-{
- u16 ldt;
- asm("sldt %0" : "=g"(ldt));
- return ldt;
-}
-
-static inline void kvm_load_ldt(u16 sel)
-{
- asm("lldt %0" : : "rm"(sel));
-}
-
#ifdef CONFIG_X86_64
static inline unsigned long read_msr(unsigned long msr)
{
@@ -2422,18 +2019,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
-{
- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-}
-
-#define TSS_IOPB_BASE_OFFSET 0x66
-#define TSS_BASE_SIZE 0x68
-#define TSS_IOPB_SIZE (65536 / 8)
-#define TSS_REDIRECTION_SIZE (256 / 8)
-#define RMODE_TSS_SIZE \
- (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-
enum {
TASK_SWITCH_CALL = 0,
TASK_SWITCH_IRET = 1,
@@ -2456,40 +2041,6 @@ enum {
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
#endif
-int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_cpu_has_extint(struct kvm_vcpu *v);
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
-int kvm_cpu_get_extint(struct kvm_vcpu *v);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-
-int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
- unsigned long ipi_bitmap_high, u32 min,
- unsigned long icr, int op_64_bit);
-
-int kvm_add_user_return_msr(u32 msr);
-int kvm_find_user_return_msr(u32 msr);
-int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
-u64 kvm_get_user_return_msr(unsigned int slot);
-
-static inline bool kvm_is_supported_user_return_msr(u32 msr)
-{
- return kvm_find_user_return_msr(msr) >= 0;
-}
-
-u64 kvm_scale_tsc(u64 tsc, u64 ratio);
-u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
-u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
-u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
-
-unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
-
-void kvm_make_scan_ioapic_request(struct kvm *kvm);
-void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
- unsigned long *vcpu_bitmap);
-
bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
@@ -2498,22 +2049,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
-extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
-
-int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
-int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-
-void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
- u32 size);
-bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
-bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
-
-static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
-{
- /* We can only post Fixed and LowPrio IRQs */
- return (irq->delivery_mode == APIC_DM_FIXED ||
- irq->delivery_mode == APIC_DM_LOWEST);
-}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
@@ -2525,36 +2060,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
kvm_x86_call(vcpu_unblocking)(vcpu);
}
-int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
-
-#define KVM_CLOCK_VALID_FLAGS \
- (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
-
-#define KVM_X86_VALID_QUIRKS \
- (KVM_X86_QUIRK_LINT0_REENABLED | \
- KVM_X86_QUIRK_CD_NW_CLEARED | \
- KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
- KVM_X86_QUIRK_OUT_7E_INC_RIP | \
- KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
- KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
- KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
- KVM_X86_QUIRK_SLOT_ZAP_ALL | \
- KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
- KVM_X86_QUIRK_IGNORE_GUEST_PAT | \
- KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \
- KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT)
-
-#define KVM_X86_CONDITIONAL_QUIRKS \
- (KVM_X86_QUIRK_CD_NW_CLEARED | \
- KVM_X86_QUIRK_IGNORE_GUEST_PAT)
-
-/*
- * KVM previously used a u32 field in kvm_run to indicate the hypercall was
- * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
- * remaining 31 lower bits must be 0 to preserve ABI.
- */
-#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
-
static inline bool kvm_arch_has_irq_bypass(void)
{
return enable_device_posted_irqs;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 77337c37324b..0474604ab8a1 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,8 +5,8 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o msrs.o pmu.o regs.o \
+ mtrr.o debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 591d2294acd7..2698fa42cd97 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -28,6 +28,7 @@
#include "trace.h"
#include "pmu.h"
#include "xen.h"
+#include "x86.h"
/*
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
index f898781b6a06..6b7b628f530d 100644
--- a/arch/x86/kvm/fpu.h
+++ b/arch/x86/kvm/fpu.h
@@ -3,8 +3,34 @@
#ifndef __KVM_FPU_H_
#define __KVM_FPU_H_
+#include <linux/kvm_host.h>
+
+#include <trace/events/kvm.h>
+
#include <asm/fpu/api.h>
+/*
+ * Swap (qemu) user FPU context for the guest FPU context.
+ *
+ * If the guest fpstate is already flagged in_use, KVM_BUG_ON() fires and the
+ * function returns without swapping. Otherwise the swap is performed and the
+ * kvm_fpu tracepoint is emitted with argument 1.
+ */
+static inline void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
+ /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
+ trace_kvm_fpu(1);
+}
+
+/*
+ * When vcpu_run ends, restore user space FPU context.
+ *
+ * If the guest fpstate is NOT flagged in_use, KVM_BUG_ON() fires and the
+ * function returns without swapping. Otherwise the swap is performed, the
+ * vCPU's fpu_reload stat is incremented, and the kvm_fpu tracepoint is emitted
+ * with argument 0.
+ */
+static inline void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
typedef u32 __attribute__((vector_size(16))) sse128_t;
#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index fd4eb1e561f7..1ee0d23f8949 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2045,10 +2045,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
* flush). Translate the address here so the memory can be uniformly
* read with kvm_read_guest().
*/
- if (!hc->fast && mmu_is_nested(vcpu)) {
- hc->ingpa = kvm_x86_ops.nested_ops->translate_nested_gpa(
- vcpu, hc->ingpa,
- PFERR_GUEST_FINAL_MASK, NULL, 0);
+ if (!hc->fast) {
+ hc->ingpa = kvm_translate_gpa(vcpu, &vcpu->arch.gva_walk, hc->ingpa,
+ PFERR_GUEST_FINAL_MASK, NULL, 0);
if (unlikely(hc->ingpa == INVALID_GPA))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 65e89ed65349..1c8f7aaab063 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -22,7 +22,8 @@
#define __ARCH_X86_KVM_HYPERV_H__
#include <linux/kvm_host.h>
-#include "x86.h"
+
+#include "regs.h"
#ifdef CONFIG_KVM_HYPERV
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 757667fb2bfa..0d59b9c758c2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -33,6 +33,7 @@
#include "lapic.h"
#include "irq.h"
#include "trace.h"
+#include "x86.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3dadae093690..81b576513116 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -113,6 +113,18 @@ void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
+
+/*
+ * Record the level driven by one IRQ source and return the resulting state of
+ * the line.
+ *
+ * Bit @irq_source_id of *@irq_state is set when @level is non-zero and cleared
+ * when it is zero. Returns 1 if any bit of *@irq_state is set afterwards,
+ * 0 otherwise.
+ */
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ __set_bit(irq_source_id, irq_state);
+ else
+ __clear_bit(irq_source_id, irq_state);
+
+ return !!(*irq_state);
+}
#endif /* CONFIG_KVM_IOAPIC */
static inline int ioapic_in_kernel(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 8c62c6d4d5c1..727245a6ab34 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -423,6 +423,13 @@ void kvm_arch_irq_routing_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+/*
+ * Return true if @irq's delivery mode is APIC_DM_FIXED or APIC_DM_LOWEST,
+ * false for every other delivery mode.
+ */
+static bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
+{
+ /* We can only post Fixed and LowPrio IRQs */
+ return (irq->delivery_mode == APIC_DM_FIXED ||
+ irq->delivery_mode == APIC_DM_LOWEST);
+}
+
static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *entry)
{
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34f4a78a7a01..1a84ea31e7fd 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -112,6 +112,12 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
return mode != KVM_IRQCHIP_NONE;
}
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 71970213dc1f..58dbb94f980d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -131,6 +131,9 @@ static inline int kvm_irq_delivery_to_apic(struct kvm *kvm,
}
void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit);
int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
@@ -237,6 +240,11 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
+/*
+ * Map a "destination mode is logical" flag to the matching APIC constant:
+ * APIC_DEST_LOGICAL when @dest_mode_logical is true, else APIC_DEST_PHYSICAL.
+ */
+static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+{
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+}
+
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e1bb663ebbd5..c9f628b97dae 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -4,10 +4,23 @@
#include <linux/kvm_host.h>
#include "regs.h"
-#include "x86.h"
#include "cpuid.h"
+extern bool tdp_enabled;
+#ifdef CONFIG_X86_64
+extern bool tdp_mmu_enabled;
+#else
+#define tdp_mmu_enabled false
+#endif
extern bool __read_mostly enable_mmio_caching;
+extern bool eager_page_split;
+
+#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
+#define KVM_MMU_HASH_SHIFT 12
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
#define PT_WRITABLE_SHIFT 1
#define PT_USER_SHIFT 2
@@ -90,6 +103,38 @@ static inline bool mmu_has_mbec(struct kvm_mmu *mmu)
u8 kvm_mmu_get_max_tdp_level(void);
+void __init kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int start_level);
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level);
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
@@ -101,11 +146,24 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
int huge_page_level, bool accessed_dirty,
bool mbec, gpa_t new_eptp);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len);
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
+ u64 addr, unsigned long roots);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
+
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);
+
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu);
+ struct kvm_pagewalk *pw);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -115,6 +173,25 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes);
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry);
+
+/*
+ * Convenience wrapper around __kvm_mmu_unprotect_gfn_and_retry() that passes
+ * always_retry=false; the return value is forwarded unchanged.
+ */
+static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa)
+{
+ return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
+}
+
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
+ ulong roots_to_free);
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
@@ -169,21 +246,21 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
}
static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu)
+ struct kvm_pagewalk *w)
{
/*
* When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
- * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * @w's snapshot of CR0.WP and thus all related paging metadata may
* be stale. Refresh CR0.WP and the metadata on-demand when checking
* for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
- * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
- * need to refresh nested_mmu, a.k.a. the walker used to translate L2
- * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM will
+ * still refresh gva_walk, so as to honor L2's CR0.WP when translating
+ * L2 GVAs to GPAs.
*/
- if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ if (!tdp_enabled || w == &vcpu->arch.ngpa_walk)
return;
- __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+ __kvm_mmu_refresh_passthrough_bits(vcpu, w);
}
/*
@@ -194,7 +271,7 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
* Return zero if the access does not fault; return the page fault error code
* if the access faults.
*/
-static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
unsigned pte_access, unsigned pte_pkey,
u64 access)
{
@@ -217,15 +294,16 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
+ struct kvm_page_format *fmt = &w->fmt;
u32 errcode = PFERR_PRESENT_MASK;
bool fault;
- kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+ kvm_mmu_refresh_passthrough_bits(vcpu, w);
- fault = (mmu->permissions[index] >> pte_access) & 1;
+ fault = (fmt->permissions[index] >> pte_access) & 1;
WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK));
- if (unlikely(mmu->pkru_mask)) {
+ if (unlikely(fmt->pkru_mask)) {
u32 pkru_bits, offset;
/*
@@ -239,7 +317,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
- pkru_bits &= mmu->pkru_mask >> offset;
+ pkru_bits &= fmt->pkru_mask >> offset;
errcode |= -pkru_bits & PFERR_PK_MASK;
fault |= (pkru_bits != 0);
}
@@ -261,12 +339,6 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
return smp_load_acquire(&kvm->arch.shadow_root_allocated);
}
-#ifdef CONFIG_X86_64
-extern bool tdp_mmu_enabled;
-#else
-#define tdp_mmu_enabled false
-#endif
-
int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
@@ -300,13 +372,18 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
+/*
+ * Return true if the vCPU's currently active MMU (vcpu->arch.mmu) is its
+ * guest_mmu instance.
+ * NOTE(review): presumably this means the vCPU is running a nested guest with
+ * nested TDP shadowing in use -- confirm against where arch.mmu is switched.
+ */
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+}
+
static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu,
+ struct kvm_pagewalk *w,
gpa_t gpa, u64 access,
struct x86_exception *exception,
u64 pte_access)
{
- if (mmu != &vcpu->arch.nested_mmu)
+ if (!mmu_is_nested(vcpu) || w == &vcpu->arch.ngpa_walk)
return gpa;
return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access,
exception,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 234d0a95abf5..dda1fd266d33 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -115,6 +115,9 @@ module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled);
#endif
+bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
@@ -225,9 +228,9 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
* and the vCPU may be incorrect/irrelevant.
*/
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
-static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_pagewalk *w) \
{ \
- return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
+ return !!(w->cpu_role. base_or_ext . reg##_##name); \
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
@@ -238,19 +241,19 @@ BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
-static inline bool has_pferr_fetch(struct kvm_mmu *mmu)
+static inline bool has_pferr_fetch(struct kvm_pagewalk *w)
{
- return mmu->cpu_role.ext.has_pferr_fetch;
+ return w->cpu_role.ext.has_pferr_fetch;
}
-static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+static inline bool is_cr0_pg(struct kvm_pagewalk *w)
{
- return mmu->cpu_role.base.level > 0;
+ return w->cpu_role.base.level > 0;
}
-static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+static inline bool is_cr4_pae(struct kvm_pagewalk *w)
{
- return !mmu->cpu_role.base.has_4_byte_gpte;
+ return !w->cpu_role.base.has_4_byte_gpte;
}
static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
@@ -270,12 +273,12 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
}
static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu)
+ struct kvm_pagewalk *w)
{
- if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && w->get_guest_pgd == get_guest_cr3)
return kvm_read_cr3(vcpu);
- return mmu->get_guest_pgd(vcpu);
+ return w->get_guest_pgd(vcpu);
}
static inline bool kvm_available_flush_remote_tlbs_range(void)
@@ -2476,12 +2479,14 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
struct kvm_vcpu *vcpu, hpa_t root,
u64 addr)
{
+ struct kvm_pagewalk *w = vcpu->arch.mmu->w;
+
iterator->addr = addr;
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->root_role.level;
if (iterator->level >= PT64_ROOT_4LEVEL &&
- vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ w->cpu_role.base.level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->root_role.direct)
iterator->level = PT32E_ROOT_LEVEL;
@@ -3690,6 +3695,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
*/
static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
+ struct kvm_mmu *mmu;
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
u64 spte;
@@ -3699,6 +3705,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (!page_fault_can_be_fast(vcpu->kvm, fault))
return ret;
+ mmu = vcpu->arch.mmu;
walk_shadow_page_lockless_begin(vcpu);
do {
@@ -3734,7 +3741,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(fault, spte)) {
+ if (!spte_permission_fault(mmu, spte, fault)) {
ret = RET_PF_SPURIOUS;
break;
}
@@ -3757,7 +3764,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* that were write-protected for dirty-logging or access
* tracking are handled here. Don't bother checking if the
* SPTE is writable to prioritize running with A/D bits enabled.
- * The is_access_allowed() check above handles the common case
+ * The spte_permission_fault() check above handles the common case
* of the fault being spurious, and the SPTE is known to be
* shadow-present, i.e. except for access tracking restoration
* making the new SPTE writable, the check is wasteful.
@@ -3782,7 +3789,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(fault, new_spte))
+ spte_permission_fault(mmu, new_spte, fault))
break;
/*
@@ -4088,12 +4095,13 @@ out_unlock:
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_pagewalk *w = mmu->w;
u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
int quadrant, i, r;
hpa_t root;
- root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, w);
root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
@@ -4105,9 +4113,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* On SVM, reading PDPTRs might access guest memory, which might fault
* and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
*/
- if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) {
for (i = 0; i < 4; ++i) {
- pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ pdptrs[i] = w->get_pdptr(vcpu, i);
if (!(pdptrs[i] & PT_PRESENT_MASK))
continue;
@@ -4129,7 +4137,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ if (w->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
mmu->root_role.level);
mmu->root.hpa = root;
@@ -4168,7 +4176,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
- if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) {
if (!(pdptrs[i] & PT_PRESENT_MASK)) {
mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
@@ -4182,7 +4190,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* directory. Othwerise each PAE page direct shadows one guest
* PAE page directory so that quadrant should be 0.
*/
- quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+ quadrant = (w->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
mmu->pae_root[i] = root | pm_mask;
@@ -4206,6 +4214,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_pagewalk *w = mmu->w;
bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
u64 *pml5_root = NULL;
u64 *pml4_root = NULL;
@@ -4218,7 +4227,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
* on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
if (mmu->root_role.direct ||
- mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ w->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
mmu->root_role.level < PT64_ROOT_4LEVEL)
return 0;
@@ -4323,7 +4332,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu->w->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
if (!is_unsync_root(root))
@@ -4364,7 +4373,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
gpa_t vaddr, u64 access,
struct x86_exception *exception)
{
@@ -4376,7 +4385,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* user-mode address if CR0.PG=0. Therefore *include* ACC_USER_MASK in
* the last argument to kvm_translate_gpa (which NPT does not use).
*/
- return kvm_translate_gpa(vcpu, mmu, vaddr, access | PFERR_GUEST_FINAL_MASK,
+ return kvm_translate_gpa(vcpu, w, vaddr, access | PFERR_GUEST_FINAL_MASK,
exception, ACC_ALL);
}
@@ -4440,7 +4449,7 @@ static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
- struct rsvd_bits_validate *rsvd_check;
+ struct kvm_page_format *rsvd_check;
int root, leaf, level;
bool reserved = false;
@@ -4461,7 +4470,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (!is_shadow_present_pte(sptes[leaf]))
leaf++;
- rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+ rsvd_check = &vcpu->arch.mmu->fmt;
for (level = root; level >= leaf; level--)
reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
@@ -4565,43 +4574,12 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
if (arch.direct_map)
arch.cr3 = (unsigned long)INVALID_GPA;
else
- arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w);
return kvm_setup_async_pf(vcpu, fault->addr,
kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}
-void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
-{
- int r;
-
- if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
- return;
-
- if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
- work->wakeup_all)
- return;
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- return;
-
- if (!vcpu->arch.mmu->root_role.direct &&
- work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
- return;
-
- r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
- true, NULL, NULL);
-
- /*
- * Account fixed page faults, otherwise they'll never be counted, but
- * ignore stats for all other return times. Page-ready "faults" aren't
- * truly spurious and never trigger emulation
- */
- if (r == RET_PF_FIXED)
- vcpu->stat.pf_fixed++;
-}
-
static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int r)
{
@@ -4958,7 +4936,7 @@ out_unlock:
}
#endif
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
@@ -4968,6 +4946,71 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
+/*
+ * Build a struct kvm_page_fault from a fault address and error code and hand
+ * it to the active MMU's page fault handler.
+ *
+ * @cr2_or_gpa: faulting address, stored as fault.addr.
+ * @err: page fault error code; its PFERR_* bits are decoded into the
+ * exec/write/present/rsvd/user/is_private fields of the fault.
+ * @prefetch: stored as fault.prefetch.
+ * @emulation_type: optional (may be NULL); EMULTYPE_WRITE_PF_TO_SP is OR'd in
+ * when the handler set fault.write_fault_to_shadow_pgtable.
+ * @level: optional (may be NULL); receives fault.goal_level.
+ *
+ * Returns the handler's result, except that a RET_PF_EMULATE result for a
+ * private fault is converted into a memory fault exit and -EFAULT (in which
+ * case @emulation_type and @level are left untouched).
+ */
+static int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 err, bool prefetch, int *emulation_type,
+ u8 *level)
+{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled =
+ is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ };
+ int r;
+
+ if (vcpu->arch.mmu->root_role.direct) {
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
+ /*
+ * With retpoline being active an indirect call is rather expensive,
+ * so do a direct call in the most common case.
+ */
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
+ r = kvm_tdp_page_fault(vcpu, &fault);
+ else
+ r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
+
+ return r;
+}
+
+
static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 error_code, u8 *level)
{
@@ -5058,6 +5101,37 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+/*
+ * Re-run the fault described by async page fault @work through
+ * kvm_mmu_do_page_fault() as a prefetch (prefetch=true, no emulation type or
+ * level output).
+ *
+ * Returns early, without faulting anything in, when:
+ * - the recorded error code has PFERR_PRIVATE_ACCESS set (also WARNs once),
+ * - the MMU's direct-ness differs from work->arch.direct_map, or @work is a
+ * wakeup_all item,
+ * - kvm_mmu_reload() fails, or
+ * - the MMU is not direct and the current guest PGD differs from the CR3
+ * recorded in @work.
+ */
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu->root_role.direct &&
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w))
+ return;
+
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+ * ignore stats for all other return types. Page-ready "faults" aren't
+ * truly spurious and never trigger emulation
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+}
+
#ifdef CONFIG_KVM_GUEST_MEMFD
static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
{
@@ -5141,7 +5215,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_spte = NULL;
}
@@ -5317,7 +5390,7 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
#include "paging_tmpl.h"
#undef PTTYPE
-static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+static void __reset_rsvds_bits_mask(struct kvm_page_format *fmt,
u64 pa_bits_rsvd, int level, bool nx,
bool gbpages, bool pse, bool amd)
{
@@ -5325,7 +5398,7 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
u64 nonleaf_bit8_rsvd = 0;
u64 high_bits_rsvd;
- rsvd_check->bad_mt_xwr = 0;
+ fmt->bad_mt_xwr = 0;
if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
@@ -5349,75 +5422,75 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
switch (level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
- rsvd_check->rsvd_bits_mask[0][1] = 0;
- rsvd_check->rsvd_bits_mask[0][0] = 0;
- rsvd_check->rsvd_bits_mask[1][0] =
- rsvd_check->rsvd_bits_mask[0][0];
+ fmt->rsvd_bits_mask[0][1] = 0;
+ fmt->rsvd_bits_mask[0][0] = 0;
+ fmt->rsvd_bits_mask[1][0] =
+ fmt->rsvd_bits_mask[0][0];
if (!pse) {
- rsvd_check->rsvd_bits_mask[1][1] = 0;
+ fmt->rsvd_bits_mask[1][1] = 0;
break;
}
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
- rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ fmt->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
- rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ fmt->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
- rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ fmt->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
high_bits_rsvd |
rsvd_bits(5, 8) |
rsvd_bits(1, 2); /* PDPTE */
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
- rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ fmt->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ fmt->rsvd_bits_mask[1][1] = high_bits_rsvd |
rsvd_bits(13, 20); /* large page */
- rsvd_check->rsvd_bits_mask[1][0] =
- rsvd_check->rsvd_bits_mask[0][0];
+ fmt->rsvd_bits_mask[1][0] =
+ fmt->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_5LEVEL:
- rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[0][4] = high_bits_rsvd |
nonleaf_bit8_rsvd |
rsvd_bits(7, 7);
- rsvd_check->rsvd_bits_mask[1][4] =
- rsvd_check->rsvd_bits_mask[0][4];
+ fmt->rsvd_bits_mask[1][4] =
+ fmt->rsvd_bits_mask[0][4];
fallthrough;
case PT64_ROOT_4LEVEL:
- rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[0][3] = high_bits_rsvd |
nonleaf_bit8_rsvd |
rsvd_bits(7, 7);
- rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[0][2] = high_bits_rsvd |
gbpages_bit_rsvd;
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
- rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
- rsvd_check->rsvd_bits_mask[1][3] =
- rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ fmt->rsvd_bits_mask[0][0] = high_bits_rsvd;
+ fmt->rsvd_bits_mask[1][3] =
+ fmt->rsvd_bits_mask[0][3];
+ fmt->rsvd_bits_mask[1][2] = high_bits_rsvd |
gbpages_bit_rsvd |
rsvd_bits(13, 29);
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ fmt->rsvd_bits_mask[1][1] = high_bits_rsvd |
rsvd_bits(13, 20); /* large page */
- rsvd_check->rsvd_bits_mask[1][0] =
- rsvd_check->rsvd_bits_mask[0][0];
+ fmt->rsvd_bits_mask[1][0] =
+ fmt->rsvd_bits_mask[0][0];
break;
}
}
static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+ struct kvm_pagewalk *w)
{
- __reset_rsvds_bits_mask(&context->guest_rsvd_check,
+ __reset_rsvds_bits_mask(&w->fmt,
vcpu->arch.reserved_gpa_bits,
- context->cpu_role.base.level, is_efer_nx(context),
+ w->cpu_role.base.level, is_efer_nx(w),
guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
- is_cr4_pse(context),
+ is_cr4_pse(w),
guest_cpuid_is_amd_compatible(vcpu));
}
-static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+static void __reset_rsvds_bits_mask_ept(struct kvm_page_format *fmt,
u64 pa_bits_rsvd, bool execonly,
int huge_page_level)
{
@@ -5430,18 +5503,18 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
if (huge_page_level < PG_LEVEL_2M)
large_2m_rsvd = rsvd_bits(7, 7);
- rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
- rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+ fmt->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ fmt->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ fmt->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ fmt->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
+ fmt->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
- rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
- rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
- rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+ fmt->rsvd_bits_mask[1][4] = fmt->rsvd_bits_mask[0][4];
+ fmt->rsvd_bits_mask[1][3] = fmt->rsvd_bits_mask[0][3];
+ fmt->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ fmt->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
+ fmt->rsvd_bits_mask[1][0] = fmt->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
@@ -5452,13 +5525,13 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
/* bits 0..2 must not be 100 unless VMX capabilities allow it */
bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
- rsvd_check->bad_mt_xwr = bad_mt_xwr;
+ fmt->bad_mt_xwr = bad_mt_xwr;
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly, int huge_page_level)
+ bool execonly, int huge_page_level)
{
- __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ __reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.fmt,
vcpu->arch.reserved_gpa_bits, execonly,
huge_page_level);
}
@@ -5480,13 +5553,13 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
bool is_amd = true;
/* KVM doesn't use 2-level page tables for the shadow MMU. */
bool is_pse = false;
- struct rsvd_bits_validate *shadow_zero_check;
+ struct kvm_page_format *fmt;
int i;
WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
- shadow_zero_check = &context->shadow_zero_check;
- __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ fmt = &context->fmt;
+ __reset_rsvds_bits_mask(fmt, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
@@ -5502,10 +5575,10 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
* Bits in shadow_me_mask but not in shadow_me_value are
* not allowed to be set.
*/
- shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
- shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
- shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
- shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
+ fmt->rsvd_bits_mask[0][i] |= shadow_me_mask;
+ fmt->rsvd_bits_mask[1][i] |= shadow_me_mask;
+ fmt->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+ fmt->rsvd_bits_mask[1][i] &= ~shadow_me_value;
}
}
@@ -5522,18 +5595,18 @@ static inline bool boot_cpu_is_amd(void)
*/
static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
- struct rsvd_bits_validate *shadow_zero_check;
+ struct kvm_page_format *fmt;
int i;
- shadow_zero_check = &context->shadow_zero_check;
+ fmt = &context->fmt;
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ __reset_rsvds_bits_mask(fmt, reserved_hpa_bits(),
context->root_role.level, true,
boot_cpu_has(X86_FEATURE_GBPAGES),
false, true);
else
- __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ __reset_rsvds_bits_mask_ept(fmt,
reserved_hpa_bits(), false,
max_huge_page_level);
@@ -5541,8 +5614,8 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
return;
for (i = context->root_role.level; --i >= 0;) {
- shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
- shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ fmt->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ fmt->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
}
}
@@ -5553,7 +5626,7 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
static void
reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
- __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ __reset_rsvds_bits_mask_ept(&context->fmt,
reserved_hpa_bits(), execonly,
max_huge_page_level);
}
@@ -5588,18 +5661,15 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
(14 & (access) ? 1 << 14 : 0) | \
(15 & (access) ? 1 << 15 : 0))
-static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
+static void __update_permission_bitmask(struct kvm_page_format *fmt, bool tdp,
+ bool ept, bool cr4_smep, bool cr4_smap,
+ bool cr0_wp, bool efer_nx)
{
unsigned index;
const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK);
const u16 r = ACC_BITS_MASK(ACC_READ_MASK);
- bool cr4_smep = is_cr4_smep(mmu);
- bool cr4_smap = is_cr4_smap(mmu);
- bool cr0_wp = is_cr0_wp(mmu);
- bool efer_nx = is_efer_nx(mmu);
-
/*
* In hardware, page fault error codes are generated (as the name
* suggests) on any kind of page fault. permission_fault() and
@@ -5612,7 +5682,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
* permission_fault() to indicate accesses that are *not* subject to
* SMAP restrictions.
*/
- for (index = 0; index < ARRAY_SIZE(mmu->permissions); ++index) {
+ for (index = 0; index < ARRAY_SIZE(fmt->permissions); ++index) {
unsigned pfec = index << 1;
/*
@@ -5686,10 +5756,23 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
}
- mmu->permissions[index] = ff | uf | wf | rf | smapf;
+ fmt->permissions[index] = ff | uf | wf | rf | smapf;
}
}
+static void update_permission_bitmask(struct kvm_pagewalk *w, bool tdp, bool ept)
+{
+ __update_permission_bitmask(&w->fmt, tdp, ept,
+ is_cr4_smep(w), is_cr4_smap(w),
+ is_cr0_wp(w), is_efer_nx(w));
+}
+
+static void update_spte_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
+{
+ __update_permission_bitmask(&mmu->fmt, tdp, ept,
+ mmu->root_role.cr4_smep, false, true, true);
+}
+
/*
* PKU is an additional mechanism by which the paging controls access to
* user-mode addresses based on the value in the PKRU register. Protection
@@ -5714,19 +5797,19 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
* away both AD and WD. For all reads or if the last condition holds, WD
* only will be masked away.
*/
-static void update_pkru_bitmask(struct kvm_mmu *mmu)
+static void update_pkru_bitmask(struct kvm_pagewalk *w)
{
unsigned bit;
bool wp;
- mmu->pkru_mask = 0;
+ w->fmt.pkru_mask = 0;
- if (!is_cr4_pke(mmu))
+ if (!is_cr4_pke(w))
return;
- wp = is_cr0_wp(mmu);
+ wp = is_cr0_wp(w);
- for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ for (bit = 0; bit < ARRAY_SIZE(w->fmt.permissions); ++bit) {
unsigned pfec, pkey_bits;
bool check_pkey, check_write, ff, uf, wf, pte_user;
@@ -5754,32 +5837,30 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu)
/* PKRU.WD stops write access. */
pkey_bits |= (!!check_write) << 1;
- mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ w->fmt.pkru_mask |= (pkey_bits & 3) << pfec;
}
}
static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu)
+ struct kvm_pagewalk *w)
{
- if (!is_cr0_pg(mmu))
+ if (!is_cr0_pg(w))
return;
- reset_guest_rsvds_bits_mask(vcpu, mmu);
- update_permission_bitmask(mmu, mmu == &vcpu->arch.guest_mmu, false);
- update_pkru_bitmask(mmu);
+ reset_guest_rsvds_bits_mask(vcpu, w);
+ update_permission_bitmask(w, w == &vcpu->arch.ngpa_walk, false);
+ update_pkru_bitmask(w);
}
static void paging64_init_context(struct kvm_mmu *context)
{
context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_spte = paging64_sync_spte;
}
static void paging32_init_context(struct kvm_mmu *context)
{
context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_spte = paging32_sync_spte;
}
@@ -5825,18 +5906,18 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
}
void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu)
+ struct kvm_pagewalk *w)
{
const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
- if (is_cr0_wp(mmu) == cr0_wp)
+ if (is_cr0_wp(w) == cr0_wp)
return;
- mmu->cpu_role.base.cr0_wp = cr0_wp;
- reset_guest_paging_metadata(vcpu, mmu);
+ w->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, w);
}
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
@@ -5894,52 +5975,37 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
- if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
- root_role.word == context->root_role.word)
+ if (root_role.word == context->root_role.word)
return;
- context->cpu_role.as_u64 = cpu_role.as_u64;
context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
context->sync_spte = NULL;
- context->get_guest_pgd = get_guest_cr3;
- context->get_pdptr = kvm_pdptr_read;
- context->inject_page_fault = kvm_inject_page_fault;
-
- if (!is_cr0_pg(context))
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- else if (is_cr4_pae(context))
- context->gva_to_gpa = paging64_gva_to_gpa;
- else
- context->gva_to_gpa = paging32_gva_to_gpa;
- reset_guest_paging_metadata(vcpu, context);
+ update_spte_permission_bitmask(context, true, shadow_xs_mask);
reset_tdp_shadow_zero_bits_mask(context);
}
static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- union kvm_cpu_role cpu_role,
union kvm_mmu_page_role root_role)
{
- if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
- root_role.word == context->root_role.word)
+ if (root_role.word == context->root_role.word)
return;
- context->cpu_role.as_u64 = cpu_role.as_u64;
context->root_role.word = root_role.word;
- if (!is_cr0_pg(context))
+ if (!is_cr0_pg(context->w))
nonpaging_init_context(context);
- else if (is_cr4_pae(context))
+ else if (is_cr4_pae(context->w))
paging64_init_context(context);
else
paging32_init_context(context);
- reset_guest_paging_metadata(vcpu, context);
+ update_spte_permission_bitmask(context, context == &vcpu->arch.guest_mmu, false);
reset_shadow_zero_bits_mask(vcpu, context);
}
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+static void init_kvm_shadow_mmu(struct kvm_vcpu *vcpu,
union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
@@ -5961,7 +6027,28 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
*/
root_role.efer_nx = true;
- shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ shadow_mmu_init_context(vcpu, context, root_role);
+}
+
+static void init_kvm_page_walk(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
+ union kvm_cpu_role cpu_role)
+{
+ if (cpu_role.as_u64 == w->cpu_role.as_u64)
+ return;
+
+ w->cpu_role.as_u64 = cpu_role.as_u64;
+ w->inject_page_fault = kvm_inject_page_fault;
+ w->get_pdptr = kvm_pdptr_read;
+ w->get_guest_pgd = get_guest_cr3;
+
+ if (!is_cr0_pg(w))
+ w->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_cr4_pae(w))
+ w->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ w->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, w);
}
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
@@ -5980,13 +6067,15 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0;
+ init_kvm_page_walk(vcpu, &vcpu->arch.ngpa_walk, cpu_role);
+
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
if (root_role.level == PT64_ROOT_5LEVEL &&
cpu_role.base.level == PT64_ROOT_4LEVEL)
root_role.passthrough = 1;
- shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ shadow_mmu_init_context(vcpu, context, root_role);
kvm_mmu_new_pgd(vcpu, nested_cr3);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
@@ -6027,18 +6116,22 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level, mbec);
- if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ struct kvm_pagewalk *ngpa_walk = &vcpu->arch.ngpa_walk;
+
+ if (new_mode.as_u64 != ngpa_walk->cpu_role.as_u64) {
/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
- context->cpu_role.as_u64 = new_mode.as_u64;
+ ngpa_walk->cpu_role.as_u64 = new_mode.as_u64;
context->root_role.word = new_mode.base.word;
context->page_fault = ept_page_fault;
- context->gva_to_gpa = ept_gva_to_gpa;
+ ngpa_walk->gva_to_gpa = ept_gva_to_gpa;
context->sync_spte = ept_sync_spte;
- update_permission_bitmask(context, true, true);
- context->pkru_mask = 0;
- reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ update_permission_bitmask(ngpa_walk, true, true);
+ ngpa_walk->fmt.pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, execonly, huge_page_level);
+
+ update_spte_permission_bitmask(context, true, true);
reset_ept_shadow_zero_bits_mask(context, execonly);
}
@@ -6046,68 +6139,19 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
- union kvm_cpu_role cpu_role)
-{
- struct kvm_mmu *context = &vcpu->arch.root_mmu;
-
- kvm_init_shadow_mmu(vcpu, cpu_role);
-
- context->get_guest_pgd = get_guest_cr3;
- context->get_pdptr = kvm_pdptr_read;
- context->inject_page_fault = kvm_inject_page_fault;
-}
-
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
- union kvm_cpu_role new_mode)
-{
- struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
-
- if (new_mode.as_u64 == g_context->cpu_role.as_u64)
- return;
-
- g_context->cpu_role.as_u64 = new_mode.as_u64;
- g_context->get_guest_pgd = get_guest_cr3;
- g_context->get_pdptr = kvm_pdptr_read;
- g_context->inject_page_fault = kvm_inject_page_fault;
-
- /*
- * L2 page tables are never shadowed, so there is no need to sync
- * SPTEs.
- */
- g_context->sync_spte = NULL;
-
- /*
- * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
- * L1's nested page tables (e.g. EPT12). The nested translation
- * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
- * L2's page tables as the first level of translation and L1's
- * nested page tables as the second level of translation. Basically
- * the gva_to_gpa functions between mmu and nested_mmu are swapped.
- */
- if (!is_paging(vcpu))
- g_context->gva_to_gpa = nonpaging_gva_to_gpa;
- else if (is_long_mode(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa;
- else if (is_pae(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa;
- else
- g_context->gva_to_gpa = paging32_gva_to_gpa;
-
- reset_guest_paging_metadata(vcpu, g_context);
-}
-
void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
- if (mmu_is_nested(vcpu))
- init_kvm_nested_mmu(vcpu, cpu_role);
- else if (tdp_enabled)
- init_kvm_tdp_mmu(vcpu, cpu_role);
- else
- init_kvm_softmmu(vcpu, cpu_role);
+ init_kvm_page_walk(vcpu, &vcpu->arch.gva_walk, cpu_role);
+
+ if (!mmu_is_nested(vcpu)) {
+ if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu, cpu_role);
+ else
+ init_kvm_shadow_mmu(vcpu, cpu_role);
+ }
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
@@ -6127,10 +6171,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
*/
vcpu->arch.root_mmu.root_role.invalid = 1;
vcpu->arch.guest_mmu.root_role.invalid = 1;
- vcpu->arch.nested_mmu.root_role.invalid = 1;
- vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
- vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
- vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.ngpa_walk.cpu_role.ext.valid = 0;
+ vcpu->arch.gva_walk.cpu_role.ext.valid = 0;
kvm_mmu_reset_context(vcpu);
KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm);
@@ -6617,22 +6659,31 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
write_unlock(&vcpu->kvm->mmu_lock);
}
-void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
u64 addr, unsigned long roots)
{
+ struct kvm_mmu *mmu;
int i;
WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
/* It's actually a GPA for vcpu->arch.guest_mmu. */
- if (mmu != &vcpu->arch.guest_mmu) {
+ if (w == &vcpu->arch.gva_walk) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
if (is_noncanonical_invlpg_address(addr, vcpu))
return;
kvm_x86_call(flush_tlb_gva)(vcpu, addr);
+
+ if (tdp_enabled)
+ return;
+
+ mmu = &vcpu->arch.root_mmu;
+ } else {
+ mmu = &vcpu->arch.guest_mmu;
}
+ /* Invalidate shadow pages, whether GVA->GPA or nGPA->GPA. */
if (!mmu->sync_spte)
return;
@@ -6658,7 +6709,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
* be synced when switching to that new cr3, so nothing needs to be
* done here for them.
*/
- kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
+ kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, KVM_MMU_ROOTS_ALL);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
@@ -6680,7 +6731,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (roots)
- kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
+ kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, roots);
++vcpu->stat.invlpg;
/*
@@ -6725,11 +6776,12 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
free_page((unsigned long)mmu->pml5_root);
}
-static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct kvm_pagewalk *w)
{
struct page *page;
int i;
+ mmu->w = w;
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
mmu->mirror_root_hpa = INVALID_PAGE;
@@ -6795,13 +6847,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
- vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu, &vcpu->arch.ngpa_walk);
if (ret)
return ret;
- ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu, &vcpu->arch.gva_walk);
if (ret)
goto fail_allocate_root;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 73cdcbccc89e..c29002c60126 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -290,8 +290,6 @@ struct kvm_page_fault {
bool write_fault_to_shadow_pgtable;
};
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
-
/*
* Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
* and of course kvm_mmu_do_page_fault().
@@ -337,70 +335,6 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
fault->is_private);
}
-static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u64 err, bool prefetch,
- int *emulation_type, u8 *level)
-{
- struct kvm_page_fault fault = {
- .addr = cr2_or_gpa,
- .error_code = err,
- .exec = err & PFERR_FETCH_MASK,
- .write = err & PFERR_WRITE_MASK,
- .present = err & PFERR_PRESENT_MASK,
- .rsvd = err & PFERR_RSVD_MASK,
- .user = err & PFERR_USER_MASK,
- .prefetch = prefetch,
- .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
- .nx_huge_page_workaround_enabled =
- is_nx_huge_page_enabled(vcpu->kvm),
-
- .max_level = KVM_MAX_HUGEPAGE_LEVEL,
- .req_level = PG_LEVEL_4K,
- .goal_level = PG_LEVEL_4K,
- .is_private = err & PFERR_PRIVATE_ACCESS,
-
- .pfn = KVM_PFN_ERR_FAULT,
- };
- int r;
-
- if (vcpu->arch.mmu->root_role.direct) {
- /*
- * Things like memslots don't understand the concept of a shared
- * bit. Strip it so that the GFN can be used like normal, and the
- * fault.addr can be used when the shared bit is needed.
- */
- fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
- fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
- }
-
- /*
- * With retpoline being active an indirect call is rather expensive,
- * so do a direct call in the most common case.
- */
- if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
- r = kvm_tdp_page_fault(vcpu, &fault);
- else
- r = vcpu->arch.mmu->page_fault(vcpu, &fault);
-
- /*
- * Not sure what's happening, but punt to userspace and hope that
- * they can fix it by changing memory to shared, or they can
- * provide a better error.
- */
- if (r == RET_PF_EMULATE && fault.is_private) {
- pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
- kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
- return -EFAULT;
- }
-
- if (fault.write_fault_to_shadow_pgtable && emulation_type)
- *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
- if (level)
- *level = fault.goal_level;
-
- return r;
-}
-
int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index df3ae0c7ec2c..e73fc09ec4db 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -55,7 +55,7 @@
#define PT_LEVEL_BITS 9
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
- #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
+ #define PT_HAVE_ACCESSED_DIRTY(w) (!(w)->cpu_role.base.ad_disabled)
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
@@ -106,13 +106,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
-static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+static inline void FNAME(protect_clean_gpte)(struct kvm_pagewalk *w, unsigned *access,
unsigned gpte)
{
unsigned mask;
/* dirty bit is not supported, so no need to track it */
- if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ if (!PT_HAVE_ACCESSED_DIRTY(w))
return;
BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
@@ -124,7 +124,7 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
*access &= mask;
}
-static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu,
+static inline int FNAME(is_present_gpte)(struct kvm_pagewalk *w,
unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
@@ -134,38 +134,40 @@ static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu,
* For EPT, an entry is present if any of bits 2:0 are set.
* With mode-based execute control, bit 10 also indicates presence.
*/
- return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0));
+ return pte & (7 | (is_cr4_smep(w) ? VMX_EPT_USER_EXECUTABLE_MASK : 0));
#endif
}
-static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+static bool FNAME(is_bad_mt_xwr)(struct kvm_page_format *fmt, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
return false;
#else
- return __is_bad_mt_xwr(rsvd_check, gpte);
+ return __is_bad_mt_xwr(fmt, gpte);
#endif
}
-static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+static bool FNAME(is_rsvd_bits_set)(struct kvm_page_format *fmt, u64 gpte, int level)
{
- return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
- FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+ return __is_rsvd_bits_set(fmt, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(fmt, gpte);
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte))
+ struct kvm_pagewalk *w = vcpu->arch.mmu->w;
+
+ if (!FNAME(is_present_gpte)(w, gpte))
goto no_present;
/* Prefetch only accessed entries (unless A/D bits are disabled). */
- if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+ if (PT_HAVE_ACCESSED_DIRTY(w) &&
!(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
- if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
+ if (FNAME(is_rsvd_bits_set)(&w->fmt, gpte, PG_LEVEL_4K))
goto no_present;
return false;
@@ -206,7 +208,7 @@ static inline unsigned FNAME(gpte_access)(u64 gpte)
}
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu,
+ struct kvm_pagewalk *w,
struct guest_walker *walker,
gpa_t addr, int write_fault)
{
@@ -217,7 +219,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
int ret;
/* dirty/accessed bits are not supported, so no need to update them */
- if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ if (!PT_HAVE_ACCESSED_DIRTY(w))
return 0;
for (level = walker->max_level; level >= walker->level; --level) {
@@ -278,7 +280,7 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
return pkeys;
}
-static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+static inline bool FNAME(is_last_gpte)(struct kvm_pagewalk *w,
unsigned int level, unsigned int gpte)
{
/*
@@ -296,7 +298,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
* is not reserved and does not indicate a large page at this level,
* so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
*/
- gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
+ gpte &= level - (PT32_ROOT_LEVEL + w->cpu_role.ext.cr4_pse);
#endif
/*
* PG_LEVEL_4K always terminates. The RHS has bit 7 set
@@ -311,7 +313,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
gpa_t addr, u64 access)
{
int ret;
@@ -340,16 +342,16 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
- walker->level = mmu->cpu_role.base.level;
- pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
- have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+ walker->level = w->cpu_role.base.level;
+ pte = kvm_mmu_get_guest_pgd(vcpu, w);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(w);
#if PTTYPE == 64
walk_nx_mask = 1ULL << PT64_NX_SHIFT;
if (walker->level == PT32E_ROOT_LEVEL) {
- pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ pte = w->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!FNAME(is_present_gpte)(mmu, pte))
+ if (!FNAME(is_present_gpte)(w, pte))
goto error;
--walker->level;
}
@@ -393,7 +395,7 @@ retry_walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(table_gfn),
nested_access | PFERR_GUEST_PAGE_MASK,
&walker->fault, 0);
@@ -422,10 +424,10 @@ retry_walk:
*/
pte_access = pt_access & (pte ^ walk_nx_mask);
- if (unlikely(!FNAME(is_present_gpte)(mmu, pte)))
+ if (unlikely(!FNAME(is_present_gpte)(w, pte)))
goto error;
- if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
+ if (unlikely(FNAME(is_rsvd_bits_set)(&w->fmt, pte, walker->level))) {
errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
@@ -434,14 +436,14 @@ retry_walk:
/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
- } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
+ } while (!FNAME(is_last_gpte)(w, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
- errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ errcode = permission_fault(vcpu, w, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
goto error;
@@ -453,7 +455,7 @@ retry_walk:
gfn += pse36_gfn_delta(pte);
#endif
- real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn),
+ real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(gfn),
access | PFERR_GUEST_FINAL_MASK,
&walker->fault, walker->pte_access);
if (real_gpa == INVALID_GPA)
@@ -462,7 +464,7 @@ retry_walk:
walker->gfn = real_gpa >> PAGE_SHIFT;
if (!write_fault)
- FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ FNAME(protect_clean_gpte)(w, &walker->pte_access, pte);
else
/*
* On a write fault, fold the dirty bit into accessed_dirty.
@@ -473,7 +475,7 @@ retry_walk:
(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
- ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, w, walker,
addr, write_fault);
if (unlikely(ret < 0))
goto error;
@@ -485,7 +487,7 @@ retry_walk:
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && has_pferr_fetch(mmu))
+ if (fetch_fault && has_pferr_fetch(w))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -540,13 +542,13 @@ error:
* ACC_*_MASK flags!
*/
walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
- if (mmu_has_mbec(mmu))
+ if (is_cr4_smep(w))
walker->fault.exit_qualification |=
EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
- walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.nested_page_fault = w != &vcpu->arch.gva_walk;
walker->fault.async_page_fault = false;
#if PTTYPE != PTTYPE_EPT
@@ -561,7 +563,7 @@ error:
static int FNAME(walk_addr)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
- return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
+ return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu->w, addr,
access);
}
@@ -577,7 +579,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte);
return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
}
@@ -660,7 +662,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
WARN_ON_ONCE(gw->gfn != base_gfn);
direct_access = gw->pte_access;
- top_level = vcpu->arch.mmu->cpu_role.base.level;
+ top_level = vcpu->arch.mmu->w->cpu_role.base.level;
if (top_level == PT32E_ROOT_LEVEL)
top_level = PT32_ROOT_LEVEL;
/*
@@ -849,7 +851,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
+ !is_cr0_wp(vcpu->arch.mmu->w) && !fault->user && fault->slot) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -859,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
- if (is_cr4_smep(vcpu->arch.mmu))
+ if (is_cr4_smep(vcpu->arch.mmu->w))
walker.pte_access &= ~ACC_EXEC_MASK;
}
#endif
@@ -894,7 +896,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
gpa_t addr, u64 access,
struct x86_exception *exception)
{
@@ -904,10 +906,10 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
#ifndef CONFIG_X86_64
/* A 64-bit GVA should be impossible on 32-bit KVM. */
- WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
+ WARN_ON_ONCE((addr >> 32) && w == &vcpu->arch.gva_walk);
#endif
- r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
+ r = FNAME(walk_addr_generic)(&walker, vcpu, w, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
@@ -957,7 +959,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte);
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
return 0;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 72d2394e089c..5fc27e9733b3 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -281,9 +281,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (prefetch && !synchronizing)
spte = mark_spte_for_access_track(spte);
- WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->fmt, spte, level),
"spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
- get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+ get_rsvd_bits(&vcpu->arch.mmu->fmt, spte, level));
/*
* Mark the memslot dirty *after* modifying it for access tracking.
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 13eea94dd212..e730717824b3 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -357,17 +357,6 @@ static inline bool is_last_spte(u64 pte, int level)
return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
-static inline bool is_executable_pte(u64 spte)
-{
- /*
- * For now, return true if either the XS or XU bit is set
- * This function is only used for fast_page_fault,
- * which never processes shadow EPT, and regular page
- * tables always have XS==XU.
- */
- return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
-}
-
static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -378,33 +367,33 @@ static inline bool is_accessed_spte(u64 spte)
return spte & shadow_accessed_mask;
}
-static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+static inline u64 get_rsvd_bits(struct kvm_page_format *fmt, u64 pte,
int level)
{
int bit7 = (pte >> 7) & 1;
- return rsvd_check->rsvd_bits_mask[bit7][level-1];
+ return fmt->rsvd_bits_mask[bit7][level-1];
}
-static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+static inline bool __is_rsvd_bits_set(struct kvm_page_format *fmt,
u64 pte, int level)
{
- return pte & get_rsvd_bits(rsvd_check, pte, level);
+ return pte & get_rsvd_bits(fmt, pte, level);
}
-static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+static inline bool __is_bad_mt_xwr(struct kvm_page_format *fmt,
u64 pte)
{
if (pte & VMX_EPT_USER_EXECUTABLE_MASK)
pte |= VMX_EPT_EXECUTABLE_MASK;
- return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+ return fmt->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
-static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+static __always_inline bool is_rsvd_spte(struct kvm_page_format *fmt,
u64 spte, int level)
{
- return __is_bad_mt_xwr(rsvd_check, spte) ||
- __is_rsvd_bits_set(rsvd_check, spte, level);
+ return __is_bad_mt_xwr(fmt, spte) ||
+ __is_rsvd_bits_set(fmt, spte, level);
}
/*
@@ -496,20 +485,40 @@ static inline bool is_mmu_writable_spte(u64 spte)
}
/*
- * Returns true if the access indicated by @fault is allowed by the existing
- * SPTE protections. Note, the caller is responsible for checking that the
- * SPTE is a shadow-present, leaf SPTE (either before or after).
+ * Returns true if the access indicated by @fault is forbidden by the existing
+ * SPTE protections.
*/
-static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static inline bool spte_permission_fault(struct kvm_mmu *mmu, u64 spte,
+ struct kvm_page_fault *fault)
{
- if (fault->exec)
- return is_executable_pte(spte);
+ unsigned pfec, pte_access;
- if (fault->write)
- return is_writable_pte(spte);
+ if (!is_shadow_present_pte(spte))
+ return true;
- /* Fault was on Read access */
- return spte & PT_PRESENT_MASK;
+ BUILD_BUG_ON(PT_PRESENT_MASK != ACC_READ_MASK);
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+ BUILD_BUG_ON(VMX_EPT_READABLE_MASK != ACC_READ_MASK);
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ /* strip nested paging fault error codes */
+ pte_access = spte & (PT_PRESENT_MASK | PT_WRITABLE_MASK);
+ if (shadow_nx_mask) {
+ pte_access |= spte & shadow_user_mask ? ACC_USER_MASK : 0;
+ pte_access |= spte & shadow_nx_mask ? 0 : ACC_EXEC_MASK;
+ } else {
+ pte_access |= spte & shadow_xs_mask ? ACC_EXEC_MASK : 0;
+ pte_access |= spte & shadow_xu_mask ? ACC_USER_EXEC_MASK : 0;
+ }
+
+ /*
+ * RSVD is handled elsewhere, and is used for SMAP in the context
+ * of accessing fmt.permissions[]. SPTEs never use PK or SS, as
+ * they are not supported for shadow paging and irrelevant for TDP.
+ */
+ pfec = fault->error_code & (
+ PFERR_WRITE_MASK | PFERR_USER_MASK | PFERR_FETCH_MASK);
+ return (mmu->fmt.permissions[pfec >> 1] >> pte_access) & 1;
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c1cbae65d239..ce3f2efadb05 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1122,6 +1122,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault,
struct tdp_iter *iter)
{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
u64 new_spte;
int ret = RET_PF_FIXED;
@@ -1131,7 +1132,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
return RET_PF_RETRY;
if (is_shadow_present_pte(iter->old_spte) &&
- (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ (fault->prefetch || !spte_permission_fault(mmu, iter->old_spte, fault)) &&
is_last_spte(iter->old_spte, iter->level)) {
WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
diff --git a/arch/x86/kvm/msrs.c b/arch/x86/kvm/msrs.c
new file mode 100644
index 000000000000..c230b18d87e3
--- /dev/null
+++ b/arch/x86/kvm/msrs.c
@@ -0,0 +1,2745 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+#include <asm/intel_pt.h>
+#include <asm/vmx.h>
+
+#include "hyperv.h"
+#include "lapic.h"
+#include "msrs.h"
+#include "pmu.h"
+#include "trace.h"
+#include "vmx/vmx.h"
+#include "xen.h"
+#include "x86.h"
+
+bool __read_mostly ignore_msrs = 0;
+module_param(ignore_msrs, bool, 0644);
+
+bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, 0644);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
+
+/* EFER defaults:
+ * - enable syscall by default because it's emulated by KVM
+ * - enable LME and LMA by default on 64-bit KVM
+ */
+#ifdef CONFIG_X86_64
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
+#else
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+#endif
+
+#define MAX_IO_MSRS 256
+
+struct msr_bitmap_range {
+ u32 flags;
+ u32 nmsrs;
+ u32 base;
+ unsigned long *bitmap;
+};
+
+struct kvm_x86_msr_filter {
+ u8 count;
+ bool default_allow:1;
+ struct msr_bitmap_range ranges[16];
+};
+
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_user_return_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
+};
+
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
+
+void kvm_destroy_user_return_msrs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
+ kvm_nr_uret_msrs = 0;
+}
+
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
+
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
+ values = &msrs->values[slot];
+ if (values->host != values->curr) {
+ wrmsrq(kvm_uret_msrs_list[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+}
+
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrq_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrq_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
+{
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
+
+void kvm_user_return_msr_cpu_online(void)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+ u64 value;
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrq_safe(kvm_uret_msrs_list[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
+ }
+}
+
+static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
+{
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+}
+
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+ int err;
+
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
+ return 0;
+ err = wrmsrq_safe(kvm_uret_msrs_list[slot], value);
+ if (err)
+ return 1;
+
+ msrs->values[slot].curr = value;
+ kvm_user_return_register_notifier(msrs);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
+
+u64 kvm_get_user_return_msr(unsigned int slot)
+{
+ return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
+
+void drop_user_return_notifiers(void)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
+
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
+}
+
+/*
+ * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
+ */
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS,
+
+ MSR_IA32_U_CET, MSR_IA32_S_CET,
+ MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
+ MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
+ MSR_IA32_DEBUGCTLMSR,
+ MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_PLATFORM_INFO,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+int kvm_get_msr_index_list(struct kvm_msr_list __user *user_msr_list)
+{
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ return -EFAULT;
+
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ return -EFAULT;
+
+ if (n < msr_list.nmsrs)
+ return -E2BIG;
+
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ return -EFAULT;
+
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+ &emulated_msrs, num_emulated_msrs * sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int kvm_get_feature_msr_index_list(struct kvm_msr_list __user *user_msr_list)
+{
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ return -EFAULT;
+
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msr_based_features;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ return -EFAULT;
+
+ if (n < msr_list.nmsrs)
+ return -E2BIG;
+
+ if (copy_to_user(user_msr_list->indices, &msr_based_features,
+ num_msr_based_features * sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+static bool kvm_is_advertised_msr(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ for (i = 0; i < num_emulated_msrs; i++) {
+ if (emulated_msrs[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
+
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
+
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
+
+u64 kvm_get_arch_capabilities(void)
+{
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ data |= ARCH_CAP_RFDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ data |= ARCH_CAP_ITS_NO;
+
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
+ data &= ~ARCH_CAP_TAA_NO;
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ data |= ARCH_CAP_TAA_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
+ return data;
+}
+
+static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ WARN_ON_ONCE(!host_initiated);
+
+ switch (index) {
+ case MSR_IA32_ARCH_CAPABILITIES:
+ *data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ *data = kvm_caps.supported_perf_cap;
+ break;
+ case MSR_PLATFORM_INFO:
+ *data = MSR_PLATFORM_INFO_CPUID_FAULT;
+ break;
+ case MSR_IA32_UCODE_REV:
+ rdmsrq_safe(index, data);
+ break;
+ default:
+ return kvm_x86_call(get_feature_msr)(index, data);
+ }
+ return 0;
+}
+
+typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated);
+
+static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
+ u64 *data, bool host_initiated,
+ enum kvm_msr_access rw,
+ msr_access_t msr_access_fn)
+{
+ const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
+ int ret;
+
+ BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
+
+ /*
+ * Zero the data on read failures to avoid leaking stack data to the
+ * guest and/or userspace, e.g. if the failure is ignored below.
+ */
+ ret = msr_access_fn(vcpu, msr, data, host_initiated);
+ if (ret && rw == MSR_TYPE_R)
+ *data = 0;
+
+ if (ret != KVM_MSR_RET_UNSUPPORTED)
+ return ret;
+
+ /*
+ * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
+ * advertises to userspace, even if an MSR isn't fully supported.
+ * Simply check that @data is '0', which covers both the write '0' case
+ * and all reads (in which case @data is zeroed on failure; see above).
+ */
+ if (host_initiated && !*data && kvm_is_advertised_msr(msr))
+ return 0;
+
+ if (!ignore_msrs) {
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, *data);
+ return ret;
+ }
+
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
+
+ return 0;
+}
+
+static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
+ kvm_get_feature_msr);
+}
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS))
+ return false;
+
+ if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT))
+ return false;
+
+ if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ if (efer & (EFER_LME | EFER_LMA) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return false;
+
+ if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX))
+ return false;
+
+ return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
+ int r;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ r = kvm_x86_call(set_efer)(vcpu, efer);
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
+
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ if (!static_cpu_has(X86_FEATURE_XSAVES) &&
+ (efer & EFER_SVME))
+ kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+
+ return 0;
+}
+
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits);
+
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
+ struct kvm *kvm = vcpu->kvm;
+ bool allowed;
+ int idx;
+ u32 i;
+
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
+ return true;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ allowed = test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return allowed;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed);
+
+/*
+ * Write @data into the MSR specified by @index. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+
+ switch (index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if a
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32 bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ data = __canonical_address(data, max_host_virt_addr_bits());
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (!kvm_is_valid_u_s_cet(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ if (!host_initiated)
+ return 1;
+ fallthrough;
+ /*
+ * Note that the MSR emulation here is flawed when a vCPU
+ * doesn't support the Intel 64 architecture. The expected
+ * architectural behavior in this case is that the upper 32
+ * bits do not exist and should always read '0'. However,
+ * because the actual hardware on which the virtual CPU is
+ * running does support Intel 64, XRSTORS/XSAVES in the
+ * guest could observe behavior that violates the
+ * architecture. Intercepting XRSTORS/XSAVES for this
+ * special case isn't deemed worthwhile.
+ */
+ case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return KVM_MSR_RET_UNSUPPORTED;
+ /*
+ * MSR_IA32_INT_SSP_TAB is not present on processors that do
+ * not support Intel 64 architecture.
+ */
+ if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
+ if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4))
+ return 1;
+ break;
+ }
+
+ msr.data = data;
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ return kvm_x86_call(set_msr)(vcpu, &msr);
+}
+
+static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ return __kvm_set_msr(vcpu, index, *data, host_initiated);
+}
+
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
+ _kvm_set_msr);
+}
+
+/*
+ * Fetch the value of MSR @index and store it in @data.  When @host_initiated
+ * is %true, select MSR-specific fault checks are skipped.
+ * Returns 0 on success, non-0 otherwise; @data is written only on success.
+ * Assumes vcpu_load() was already called.
+ */
+static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+			 bool host_initiated)
+{
+	struct msr_data msr;
+	int ret;
+
+	msr.index = index;
+	msr.host_initiated = host_initiated;
+
+	switch (index) {
+	case MSR_TSC_AUX:
+		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+			return 1;
+
+		/* Guest reads need RDTSCP or RDPID in the guest's CPU caps. */
+		if (!(host_initiated ||
+		      guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+		      guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)))
+			return 1;
+		break;
+	case MSR_IA32_U_CET:
+	case MSR_IA32_S_CET:
+		if (!(guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ||
+		      guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)))
+			return KVM_MSR_RET_UNSUPPORTED;
+		break;
+	case MSR_KVM_INTERNAL_GUEST_SSP:
+		/* The KVM-internal SSP index is reserved for host-initiated reads. */
+		if (!host_initiated)
+			return 1;
+		fallthrough;
+	case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+			return KVM_MSR_RET_UNSUPPORTED;
+		break;
+	}
+
+	ret = kvm_x86_call(get_msr)(vcpu, &msr);
+	if (ret)
+		return ret;
+
+	*data = msr.data;
+	return 0;
+}
+
+/*
+ * Read MSR @index into @data by routing the access through
+ * kvm_do_msr_access() as an MSR_TYPE_R access backed by __kvm_get_msr().
+ */
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+				     u32 index, u64 *data, bool host_initiated)
+{
+	u64 *value = data;
+
+	return kvm_do_msr_access(vcpu, index, value, host_initiated,
+				 MSR_TYPE_R, __kvm_get_msr);
+}
+
+/* Write @data to MSR @index as a host-initiated access. */
+int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	const bool host_initiated = true;
+
+	return __kvm_set_msr(vcpu, index, data, host_initiated);
+}
+
+/* Read MSR @index into @data as a host-initiated access. */
+int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	const bool host_initiated = true;
+
+	return __kvm_get_msr(vcpu, index, data, host_initiated);
+}
+
+/*
+ * Guest-initiated (host_initiated = false) MSR read.  No MSR filter check is
+ * performed in this function.
+ */
+int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	const bool host_initiated = false;
+
+	return kvm_get_msr_ignored_check(vcpu, index, data, host_initiated);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read);
+
+/*
+ * Guest-initiated (host_initiated = false) MSR write.  No MSR filter check is
+ * performed in this function.
+ */
+int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	const bool host_initiated = false;
+
+	return kvm_set_msr_ignored_check(vcpu, index, data, host_initiated);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write);
+
+/*
+ * Guest-initiated MSR read that honors the MSR filter: returns
+ * KVM_MSR_RET_FILTERED if reads of @index are not allowed, otherwise the
+ * result of __kvm_emulate_msr_read().
+ */
+int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	if (kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+		return __kvm_emulate_msr_read(vcpu, index, data);
+
+	return KVM_MSR_RET_FILTERED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read);
+
+/*
+ * Guest-initiated MSR write that honors the MSR filter: returns
+ * KVM_MSR_RET_FILTERED if writes to @index are not allowed, otherwise the
+ * result of __kvm_emulate_msr_write().
+ */
+int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	if (kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+		return __kvm_emulate_msr_write(vcpu, index, data);
+
+	return KVM_MSR_RET_FILTERED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write);
+
+/*
+ * Try to emulate a WRMSR of @data to @msr in the fastpath.  Only the x2APIC
+ * ICR and the TSC deadline MSR are handled here; any other MSR, or the PMU
+ * disallowing fastpath emulation, yields EXIT_FASTPATH_NONE.
+ */
+static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu))
+		return EXIT_FASTPATH_NONE;
+
+	if (msr == APIC_BASE_MSR + (APIC_ICR >> 4)) {
+		/* The fast ICR write needs an in-kernel APIC in x2APIC mode. */
+		if (!lapic_in_kernel(vcpu) ||
+		    !apic_x2apic_mode(vcpu->arch.apic) ||
+		    kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
+			return EXIT_FASTPATH_NONE;
+	} else if (msr == MSR_IA32_TSC_DEADLINE) {
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
+	} else {
+		return EXIT_FASTPATH_NONE;
+	}
+
+	trace_kvm_msr_write(msr, data);
+
+	/* A zero return from the skip helper results in an exit to userspace. */
+	return kvm_skip_emulated_instruction(vcpu) ?
+		EXIT_FASTPATH_REENTER_GUEST : EXIT_FASTPATH_EXIT_USERSPACE;
+}
+
+/* Fastpath WRMSR: the MSR index comes from ECX and the value from EDX:EAX. */
+fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu)
+{
+	u32 msr = kvm_ecx_read(vcpu);
+	u64 data = kvm_read_edx_eax(vcpu);
+
+	return __handle_fastpath_wrmsr(vcpu, msr, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr);
+
+/* Fastpath WRMSR with an explicit MSR index; the value is read from register @reg. */
+fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+	u64 data = kvm_register_read(vcpu, reg);
+
+	return __handle_fastpath_wrmsr(vcpu, msr, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm);
+
+/*
+ * Load the value supplied in vcpu->run->msr.data into EAX (low half) and EDX
+ * (high half), unless vcpu->run->msr.error is set.
+ */
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->msr.error)
+		return;
+
+	kvm_eax_write(vcpu, vcpu->run->msr.data);
+	kvm_edx_write(vcpu, vcpu->run->msr.data >> 32);
+}
+
+/*
+ * Finish an emulated instruction: with no error, skip it via the emulator;
+ * otherwise inject #GP(0) and return 1.
+ */
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+	if (!err)
+		return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE |
+						     EMULTYPE_SKIP |
+						     EMULTYPE_COMPLETE_USER_EXIT);
+
+	kvm_inject_gp(vcpu, 0);
+	return 1;
+}
+
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
+}
+
+/* Completion for an emulated RDMSR: load the result registers, then finish the access. */
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	complete_userspace_rdmsr(vcpu);
+	ret = complete_emulated_msr_access(vcpu);
+
+	return ret;
+}
+
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+}
+
+/* Completion for RDMSR: load EDX:EAX from the run struct, then finish the access. */
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	complete_userspace_rdmsr(vcpu);
+	ret = complete_fast_msr_access(vcpu);
+
+	return ret;
+}
+
+/*
+ * Completion for the immediate form of RDMSR: on success the value from
+ * vcpu->run->msr.data goes to the register saved in cui_rdmsr_imm_reg.
+ */
+static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->msr.error)
+		return complete_fast_msr_access(vcpu);
+
+	kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
+			   vcpu->run->msr.data);
+
+	return complete_fast_msr_access(vcpu);
+}
+
+/*
+ * Translate an MSR access return code into a KVM_MSR_EXIT_REASON_* value.
+ * Anything other than UNSUPPORTED or FILTERED maps to REASON_INVAL.
+ */
+static u64 kvm_msr_reason(int r)
+{
+	if (r == KVM_MSR_RET_UNSUPPORTED)
+		return KVM_MSR_EXIT_REASON_UNKNOWN;
+
+	if (r == KVM_MSR_RET_FILTERED)
+		return KVM_MSR_EXIT_REASON_FILTER;
+
+	return KVM_MSR_EXIT_REASON_INVAL;
+}
+
+/*
+ * Set up an exit to userspace for a failed MSR access, provided the VM's
+ * user_space_msr_mask includes the reason derived from @r.
+ * Returns 1 if the exit was prepared (with @completion installed as the
+ * complete_userspace_io callback), 0 if the reason is not in the mask.
+ */
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+			      u32 exit_reason, u64 data,
+			      int (*completion)(struct kvm_vcpu *vcpu),
+			      int r)
+{
+	u64 msr_reason = kvm_msr_reason(r);
+
+	if ((vcpu->kvm->arch.user_space_msr_mask & msr_reason) == 0)
+		return 0;
+
+	vcpu->run->exit_reason = exit_reason;
+	vcpu->run->msr.index = index;
+	vcpu->run->msr.data = data;
+	vcpu->run->msr.reason = msr_reason;
+	vcpu->run->msr.error = 0;
+	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+
+	vcpu->arch.complete_userspace_io = completion;
+
+	return 1;
+}
+
+/*
+ * Emulate a read of @msr.  On success the value lands in EDX:EAX when @reg is
+ * negative, or in register @reg otherwise.  On failure, userspace is given a
+ * chance to handle the access (return 0 with @complete_rdmsr installed);
+ * if it isn't interested, the error is handed to the vendor
+ * complete_emulated_msr() hook.
+ */
+static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg,
+			       int (*complete_rdmsr)(struct kvm_vcpu *))
+{
+	u64 data;
+	int r;
+
+	r = kvm_emulate_msr_read(vcpu, msr, &data);
+	if (r) {
+		/* The read failed, see if userspace wants to handle it. */
+		if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0,
+				       complete_rdmsr, r))
+			return 0;
+
+		trace_kvm_msr_read_ex(msr);
+		return kvm_x86_call(complete_emulated_msr)(vcpu, r);
+	}
+
+	trace_kvm_msr_read(msr, data);
+
+	if (reg >= 0) {
+		kvm_register_write(vcpu, reg, data);
+	} else {
+		kvm_eax_write(vcpu, data);
+		kvm_edx_write(vcpu, data >> 32);
+	}
+
+	return kvm_x86_call(complete_emulated_msr)(vcpu, r);
+}
+
+/* Emulate RDMSR: the MSR index comes from ECX and the result goes to EDX:EAX. */
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+	u32 msr = kvm_ecx_read(vcpu);
+
+	return __kvm_emulate_rdmsr(vcpu, msr, -1, complete_fast_rdmsr);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr);
+
+/*
+ * Emulate the immediate form of RDMSR: read @msr into register @reg.  The
+ * register is stashed in cui_rdmsr_imm_reg before the access so that
+ * complete_fast_rdmsr_imm() can use it if userspace completes the read.
+ */
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+	int ret;
+
+	vcpu->arch.cui_rdmsr_imm_reg = reg;
+	ret = __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
+
+	return ret;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm);
+
+/*
+ * Emulate a write of @data to @msr.  On failure, userspace is given a chance
+ * to handle the access (return 0); if it isn't interested, negative errors
+ * are returned as-is and all other results are handed to the vendor
+ * complete_emulated_msr() hook.
+ */
+static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	int r = kvm_emulate_msr_write(vcpu, msr, data);
+
+	if (r) {
+		/* The write failed, see if userspace wants to handle it. */
+		if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data,
+				       complete_fast_msr_access, r))
+			return 0;
+
+		/* Signal all other negative errors to userspace */
+		if (r < 0)
+			return r;
+
+		trace_kvm_msr_write_ex(msr, data);
+	} else {
+		trace_kvm_msr_write(msr, data);
+	}
+
+	return kvm_x86_call(complete_emulated_msr)(vcpu, r);
+}
+
+/* Emulate WRMSR: the MSR index comes from ECX and the value from EDX:EAX. */
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+	u32 msr = kvm_ecx_read(vcpu);
+	u64 data = kvm_read_edx_eax(vcpu);
+
+	return __kvm_emulate_wrmsr(vcpu, msr, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr);
+
+/* Emulate the immediate form of WRMSR: write the value of register @reg to @msr. */
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+	u64 data = kvm_register_read(vcpu, reg);
+
+	return __kvm_emulate_wrmsr(vcpu, msr, data);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm);
+
+/*
+ * Filtered MSR read on behalf of the instruction emulator.  Maps the result
+ * to an X86EMUL_* code: CONTINUE on success, UNHANDLEABLE for negative
+ * errors, IO_NEEDED when the access is deferred to userspace, and
+ * PROPAGATE_FAULT otherwise.
+ */
+int kvm_emulator_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index,
+				     u64 *pdata)
+{
+	int r = kvm_emulate_msr_read(vcpu, msr_index, pdata);
+
+	if (r < 0)
+		return X86EMUL_UNHANDLEABLE;
+
+	if (!r) {
+		trace_kvm_msr_read(msr_index, *pdata);
+		return X86EMUL_CONTINUE;
+	}
+
+	if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+			       complete_emulated_rdmsr, r))
+		return X86EMUL_IO_NEEDED;
+
+	trace_kvm_msr_read_ex(msr_index);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
+/*
+ * Filtered MSR write on behalf of the instruction emulator.  Maps the result
+ * to an X86EMUL_* code: CONTINUE on success, UNHANDLEABLE for negative
+ * errors, IO_NEEDED when the access is deferred to userspace, and
+ * PROPAGATE_FAULT otherwise.
+ */
+int kvm_emulator_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index,
+				     u64 data)
+{
+	int r = kvm_emulate_msr_write(vcpu, msr_index, data);
+
+	if (r < 0)
+		return X86EMUL_UNHANDLEABLE;
+
+	if (!r) {
+		trace_kvm_msr_write(msr_index, data);
+		return X86EMUL_CONTINUE;
+	}
+
+	if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+			       complete_emulated_msr_access, r))
+		return X86EMUL_IO_NEEDED;
+
+	trace_kvm_msr_write_ex(msr_index, data);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
+/* Unfiltered MSR read on behalf of the instruction emulator. */
+int kvm_emulator_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	bool is_guest_ssp = msr_index == MSR_KVM_INTERNAL_GUEST_SSP;
+
+	/*
+	 * Reads of the current shadow stack pointer are performed as host-
+	 * initiated accesses: SSP is "just a reg" rather than a true MSR, and
+	 * this API is used only for implicit accesses, i.e. not RDMSR, so
+	 * the index is fully KVM-controlled.
+	 */
+	if (unlikely(is_guest_ssp))
+		return kvm_msr_read(vcpu, msr_index, pdata);
+
+	return __kvm_emulate_msr_read(vcpu, msr_index, pdata);
+}
+
+/*
+ * Returns true if @msr is managed via XSTATE for this vCPU, i.e. is context
+ * switched together with the rest of the guest FPU state.  A NULL @vcpu
+ * always yields false.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
+ */
+static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	if (!vcpu)
+		return false;
+
+	if (msr == MSR_IA32_U_CET)
+		return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ||
+		       guest_cpu_cap_has(vcpu, X86_FEATURE_IBT);
+
+	if (msr >= MSR_IA32_PL0_SSP && msr <= MSR_IA32_PL3_SSP)
+		return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+	return false;
+}
+
+/*
+ * Read or write, directly in hardware, an MSR that is managed via XSTATE,
+ * with the guest FPU locked (and re-loaded if necessary) around the access.
+ * The caller is responsible for the initial FPU load; this helper only
+ * ensures that guest state is resident in hardware (the kernel can load its
+ * own FPU state in IRQ context).
+ *
+ * @access must be the compile-time constant MSR_TYPE_R or MSR_TYPE_W.
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
+ */
+static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
+						  struct msr_data *msr_info,
+						  int access)
+{
+	BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W);
+
+	KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
+	KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
+
+	kvm_fpu_get();
+	if (access != MSR_TYPE_R)
+		wrmsrq(msr_info->index, msr_info->data);
+	else
+		rdmsrq(msr_info->index, msr_info->data);
+	kvm_fpu_put();
+}
+
+/* Write msr_info->data to the XSTATE-managed MSR msr_info->index. */
+static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W);
+}
+
+/* Read the XSTATE-managed MSR msr_info->index into msr_info->data. */
+static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R);
+}
+
+/*
+ * Publish the wall clock epoch into the pvclock_wall_clock structure at guest
+ * physical address @wall_clock.  A zero @wall_clock is a no-op.  If
+ * @sec_hi_ofs is non-zero, the upper 32 bits of the seconds value are also
+ * written at @wall_clock + @sec_hi_ofs.  A failure of the initial version
+ * read or write aborts the update; the results of the later guest writes are
+ * not checked.
+ */
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+{
+ int version;
+ int r;
+ struct pvclock_wall_clock wc;
+ u32 wc_sec_hi;
+ u64 wall_nsec;
+
+ if (!wall_clock)
+ return;
+
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ /* The version is now even; make it odd for the duration of the update. */
+ ++version;
+
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
+
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
+
+ /* After do_div(), wall_nsec is used as the seconds value below. */
+ wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
+ /* Bump the version back to an even value now that the data is written. */
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+/*
+ * Handle a write of @system_time to one of the kvmclock system time MSRs.
+ * @old_msr says whether the legacy MSR was used.  Bit 0 of @system_time is
+ * the enable bit; the remaining bits give the guest address of the
+ * pvclock_vcpu_time_info structure.
+ */
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+				  bool old_msr, bool host_initiated)
+{
+	struct kvm_arch *ka = &vcpu->kvm->arch;
+	bool enable = system_time & 1;
+
+	/* Track which kvmclock MSR vCPU 0 uses, for guest-initiated writes only. */
+	if (!host_initiated && vcpu->vcpu_id == 0) {
+		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+		ka->boot_vcpu_runs_old_kvmclock = old_msr;
+	}
+
+	vcpu->arch.time = system_time;
+	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+	if (!enable) {
+		kvm_gpc_deactivate(&vcpu->arch.pv_time);
+		return;
+	}
+
+	kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
+			 sizeof(struct pvclock_vcpu_time_info));
+}
+
+/*
+ * These helpers are safe iff @msr is known to be an MCx bank MSR.
+ *
+ * A control MSR is one whose two low index bits are both clear.
+ */
+static bool is_mci_control_msr(u32 msr)
+{
+	return !(msr & 3);
+}
+/* A status MSR is one whose two low index bits equal 1. */
+static bool is_mci_status_msr(u32 msr)
+{
+	const u32 bank_reg = msr & 3;
+
+	return bank_reg == 1;
+}
+
+/*
+ * On AMD, HWCR[McStatusWrEn] (bit 18) controls whether setting MCi_STATUS
+ * results in #GP.  Returns true only for AMD-compatible guests that have the
+ * bit set in their HWCR value.
+ */
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
+{
+	if (!guest_cpuid_is_amd_compatible(vcpu))
+		return false;
+
+	/* McStatusWrEn enabled? */
+	return (vcpu->arch.msr_hwcr & BIT_ULL(18)) != 0;
+}
+
+/*
+ * Handle a write to one of the machine-check MSRs described by @msr_info.
+ * Returns 0 if the value was stored, 1 if the write is rejected (bank index
+ * beyond the count in MCG_CAP[7:0], capability bit missing, disallowed value,
+ * or an MSR not handled here).
+ */
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u32 offset, last_msr;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ /* Without MCG_CTL_P, only a host-initiated write of 0 is accepted. */
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return 1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ /* Without MCG_CMCI_P, only a host-initiated write of 0 is accepted. */
+ if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+ return 1;
+ /* An attempt to write a 1 to a reserved bit raises #GP */
+ if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+ return 1;
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+ last_msr + 1 - MSR_IA32_MC0_CTL2);
+ vcpu->arch.mci_ctl2_banks[offset] = data;
+ break;
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+ if (msr > last_msr)
+ return 1;
+
+ /*
+ * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+ * values are architecturally undefined. But, some Linux
+ * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
+ * issue on AMD K8s, allow bit 10 to be clear when setting all
+ * other bits in order to avoid an uncaught #GP in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
+ * single-bit ECC data errors.
+ */
+ if (is_mci_control_msr(msr) &&
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
+
+ /*
+ * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+ * AMD-based CPUs allow non-zero values, but if and only if
+ * HWCR[McStatusWrEn] is set.
+ */
+ if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+ data != 0 && !can_set_mci_status(vcpu))
+ return 1;
+
+ offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+ last_msr + 1 - MSR_IA32_MC0_CTL);
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ default:
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Handle a write of @data to the async page fault enable MSR.  Returns 0 if
+ * the value was accepted, 1 if it must be rejected.
+ */
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+	gpa_t gpa = data & ~0x3f;
+
+	/* Bits 4:5 are reserved, Should be zero */
+	if ((data >> 4) & 0x3)
+		return 1;
+
+	/* Delivery modes may only be selected if the matching feature is exposed. */
+	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+	    (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+		return 1;
+
+	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+	    (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+		return 1;
+
+	/* Without an in-kernel local APIC only a write of 0 is accepted. */
+	if (!lapic_in_kernel(vcpu))
+		return data != 0;
+
+	if (__kvm_pv_async_pf_enabled(data) &&
+	    kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+				      sizeof(u64)))
+		return 1;
+
+	vcpu->arch.apf.msr_en_val = data;
+
+	if (!__kvm_pv_async_pf_enabled(data)) {
+		kvm_clear_async_pf_completion_queue(vcpu);
+		kvm_async_pf_hash_reset(vcpu);
+		return 0;
+	}
+
+	kvm_async_pf_wakeup_all(vcpu);
+	return 0;
+}
+
+/*
+ * Handle a write of @data to the async page fault interrupt MSR.  The write
+ * is rejected (return 1) if any reserved bit is set or if the local APIC is
+ * not in-kernel; otherwise the raw value and the vector are recorded.
+ */
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
+{
+	/* Bits 8-63 are reserved */
+	if ((data >> 8) != 0 || !lapic_in_kernel(vcpu))
+		return 1;
+
+	vcpu->arch.apf.msr_int_val = data;
+	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+/* Mask the guest's supported XCR0 down to the XFEATURE_MASK_USER_DYNAMIC features. */
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
+{
+	u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0;
+
+	return supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
+/*
+ * Handle a write of msr_info->data to MSR msr_info->index.
+ *
+ * Returns 0 on success, 1 if the write is rejected, KVM_MSR_RET_UNSUPPORTED
+ * if the MSR is not handled here or the required feature is not exposed to
+ * the guest, or the return value of the helper the write is delegated to.
+ * Several checks are skipped or altered when msr_info->host_initiated is set.
+ */
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /*
+ * Do not allow host-initiated writes to trigger the Xen hypercall
+ * page setup; it could incur locking paths which are not expected
+ * if userspace sets the MSR in an unusual location.
+ */
+ if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
+ !msr_info->host_initiated)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
+ switch (msr) {
+ /* Writes to these MSRs are accepted and discarded. */
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ /* Guest writes are silently ignored. */
+ if (msr_info->host_initiated)
+ vcpu->arch.microcode_version = data;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return KVM_MSR_RET_UNSUPPORTED;
+ vcpu->arch.arch_capabilities = data;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (data & ~kvm_caps.supported_perf_cap)
+ return 1;
+
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
+ vcpu->arch.perf_capabilities = data;
+ kvm_pmu_refresh(vcpu);
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
+ break;
+ case MSR_IA32_PRED_CMD: {
+ u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+ if (!msr_info->host_initiated) {
+ if ((!guest_has_pred_cmd_msr(vcpu)))
+ return 1;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+ }
+
+ /* Commands the host CPU lacks are reserved regardless of who writes. */
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!boot_cpu_has(X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+
+ if (data & reserved_bits)
+ return 1;
+
+ if (!data)
+ break;
+
+ wrmsrq(MSR_IA32_PRED_CMD, data);
+ break;
+ }
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
+ case MSR_EFER:
+ return set_efer(vcpu, msr_info);
+ case MSR_K7_HWCR: {
+ /*
+ * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+ * through at least v6.6 whine if TscFreqSel is clear,
+ * depending on F/M/S.)
+ */
+ u64 valid = BIT_ULL(18) | BIT_ULL(24);
+
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID))
+ valid |= MSR_K7_HWCR_CPUID_USER_DIS;
+
+ if (data & ~valid) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ vcpu->arch.msr_hwcr = data;
+ break;
+ }
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ break;
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ vcpu->arch.pat = data;
+ break;
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_set_msr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSC_DEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (!msr_info->host_initiated) {
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
+ case MSR_IA32_MISC_ENABLE: {
+ u64 old_val = vcpu->arch.ia32_misc_enable_msr;
+
+ if (!msr_info->host_initiated) {
+ /* RO bits */
+ if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
+ return 1;
+
+ /* R bits, i.e. writes are ignored, but don't fault. */
+ data = data & ~MSR_IA32_MISC_ENABLE_EMON;
+ data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
+ }
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+ ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
+ return 1;
+ vcpu->arch.ia32_misc_enable_msr = data;
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ } else {
+ vcpu->arch.ia32_misc_enable_msr = data;
+ }
+ break;
+ }
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
+ case MSR_IA32_POWER_CTL:
+ vcpu->arch.msr_ia32_power_ctl = data;
+ break;
+ case MSR_IA32_TSC:
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, &data);
+ } else if (!vcpu->arch.guest_tsc_protected) {
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
+ break;
+ case MSR_IA32_XSS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (data & ~vcpu->arch.guest_supported_xss)
+ return 1;
+ if (vcpu->arch.ia32_xss == data)
+ break;
+ vcpu->arch.ia32_xss = data;
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (data & 0x1) {
+ /*
+ * Pairs with the smp_mb__after_atomic() in
+ * kvm_arch_async_page_present_queued().
+ */
+ smp_store_mb(vcpu->arch.apf.pageready_pending, false);
+
+ kvm_check_async_pf_completion(vcpu);
+ }
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
+ return 1;
+ break;
+
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ /* only enable bit supported */
+ if (data & (-1ULL << 1))
+ return 1;
+
+ vcpu->arch.msr_kvm_poll_control = data;
+ break;
+
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return set_msr_mce(vcpu, msr_info);
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ /* Not a PMU MSR for this vCPU: the write is dropped, non-zero values are logged. */
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but a recommended workaround from
+ * AMD for these chips. It is possible to specify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
+#endif
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ /* CPUID faulting can only be enabled if MSR_PLATFORM_INFO advertises it. */
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~kvm_guest_supported_xfd(vcpu))
+ return 1;
+
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_set_xstate_msr(vcpu, msr_info);
+ break;
+ default:
+ /* Anything else is either a PMU MSR or unsupported. */
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common);
+
+/*
+ * Read machine-check MSR @msr into @pdata.  @host is true for host-initiated
+ * reads, which are exempt from the MCG_CTL_P and MCG_CMCI_P capability checks.
+ * Returns 0 on success, or 1 if the bank is beyond the count in MCG_CAP[7:0],
+ * the required capability is missing, or the MSR is not handled here.
+ * @pdata is written only on success.
+ */
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+	u64 mcg_cap = vcpu->arch.mcg_cap;
+	unsigned nr_banks = mcg_cap & 0xff;
+	u32 idx, end_msr;
+
+	switch (msr) {
+	case MSR_IA32_P5_MC_ADDR:
+	case MSR_IA32_P5_MC_TYPE:
+		*pdata = 0;
+		return 0;
+	case MSR_IA32_MCG_CAP:
+		*pdata = vcpu->arch.mcg_cap;
+		return 0;
+	case MSR_IA32_MCG_CTL:
+		if (!(mcg_cap & MCG_CTL_P) && !host)
+			return 1;
+		*pdata = vcpu->arch.mcg_ctl;
+		return 0;
+	case MSR_IA32_MCG_STATUS:
+		*pdata = vcpu->arch.mcg_status;
+		return 0;
+	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+		/* First CTL2 MSR index past the banks this vCPU has. */
+		end_msr = MSR_IA32_MCx_CTL2(nr_banks);
+		if (msr >= end_msr)
+			return 1;
+
+		if (!(mcg_cap & MCG_CMCI_P) && !host)
+			return 1;
+
+		idx = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+					 end_msr - MSR_IA32_MC0_CTL2);
+		*pdata = vcpu->arch.mci_ctl2_banks[idx];
+		return 0;
+	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+		/* First bank MSR index past the banks this vCPU has. */
+		end_msr = MSR_IA32_MCx_CTL(nr_banks);
+		if (msr >= end_msr)
+			return 1;
+
+		idx = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+					 end_msr - MSR_IA32_MC0_CTL);
+		*pdata = vcpu->arch.mce_banks[idx];
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ switch (msr_info->index) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_AMD64_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
+ case MSR_VM_HSAVE_PA:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
+ case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
+ msr_info->data = 0;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ msr_info->data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = vcpu->arch.microcode_version;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return KVM_MSR_RET_UNSUPPORTED;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
+ return KVM_MSR_RET_UNSUPPORTED;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+ break;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * return L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 offset, ratio;
+
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
+ break;
+ }
+ case MSR_IA32_CR_PAT:
+ msr_info->data = vcpu->arch.pat;
+ break;
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
+ case 0xcd: /* fsb frequency */
+ msr_info->data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ msr_info->data = 1 << 24;
+ break;
+ case MSR_IA32_APICBASE:
+ msr_info->data = vcpu->arch.apic_base;
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+ case MSR_IA32_TSC_DEADLINE:
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ msr_info->data = 1000ULL;
+ /* CPU multiplier */
+ msr_info->data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ msr_info->data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.apf.msr_en_val;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.apf.msr_int_val;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = 0;
+ break;
+ case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.st.msr_val;
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide expected ramp-up count for K7. All other
+ * are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ msr_info->data = 0x20000000;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+#endif
+ case MSR_IA32_BBL_CR_CTL3:
+ /* This legacy MSR exists but isn't fully documented in current
+ * silicon. It is however accessed by winxp in very narrow
+ * scenarios where it sets bit #19, itself documented as
+ * a "reserved" bit. Best effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ */
+ msr_info->data = 0xbe702111;
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.status;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
+ case MSR_K7_HWCR:
+ msr_info->data = vcpu->arch.msr_hwcr;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_get_xstate_msr(vcpu, msr_info);
+ break;
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info);
+
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common);
+
+ /*
+ * Read a single MSR on behalf of userspace.  The trailing 'true' handed to
+ * kvm_get_msr_ignored_check() presumably marks the access as host-initiated
+ * (confirm against that helper); its return value is forwarded unchanged.
+ */
+ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+ {
+ 	int r = kvm_get_msr_ignored_check(vcpu, index, data, true);
+
+ 	return r;
+ }
+
+ /*
+ * Write a single MSR on behalf of userspace.
+ *
+ * Once the vCPU model can no longer be changed, an immutable feature MSR may
+ * only be "written" with the value it already holds: KVM doesn't support
+ * modifying the guest vCPU model on the fly (e.g. changing the VMX
+ * capabilities MSRs while L2 is active is nonsensical), but same-value writes
+ * are tolerated so that userspace can blindly stuff all MSRs, e.g. when
+ * emulating RESET.  Any other write to such an MSR, or a failure to read its
+ * current value, is rejected with -EINVAL.
+ */
+ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+ {
+ 	u64 current_val;
+
+ 	if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) &&
+ 	    kvm_is_immutable_feature_msr(index)) {
+ 		if (do_get_msr(vcpu, index, &current_val))
+ 			return -EINVAL;
+
+ 		if (current_val != *data)
+ 			return -EINVAL;
+ 	}
+
+ 	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
+ }
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ bool fpu_loaded = false;
+ int i;
+
+ for (i = 0; i < msrs->nmsrs; ++i) {
+ /*
+ * If userspace is accessing one or more XSTATE-managed MSRs,
+ * temporarily load the guest's FPU state so that the guest's
+ * MSR value(s) is resident in hardware and thus can be accessed
+ * via RDMSR/WRMSR.
+ */
+ if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) {
+ kvm_load_guest_fpu(vcpu);
+ fpu_loaded = true;
+ }
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+ }
+ if (fpu_loaded)
+ kvm_put_guest_fpu(vcpu);
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ unsigned size;
+ int r;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
+ goto out;
+ }
+
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
+
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ r = -EFAULT;
+
+ kfree(entries);
+out:
+ return r;
+}
+
+ /*
+ * Read a batch of feature MSRs into a userspace kvm_msrs buffer.  No vCPU is
+ * associated with the access (NULL is passed down), and the values read are
+ * written back to userspace.
+ */
+ int kvm_get_feature_msrs(struct kvm_msrs __user *user_msrs)
+ {
+ 	const int writeback = 1;
+
+ 	return msr_io(NULL, user_msrs, do_get_feature_msr, writeback);
+ }
+
+ /*
+ * Read a batch of MSRs for @vcpu into a userspace kvm_msrs buffer.  The
+ * VM's SRCU guard is held across the whole batch and the values read are
+ * written back to userspace.
+ */
+ int kvm_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs)
+ {
+ 	const int writeback = 1;
+
+ 	guard(srcu)(&vcpu->kvm->srcu);
+
+ 	return msr_io(vcpu, user_msrs, do_get_msr, writeback);
+ }
+
+ /*
+ * Write a batch of MSRs for @vcpu from a userspace kvm_msrs buffer.  The
+ * VM's SRCU guard is held across the whole batch; nothing is copied back to
+ * userspace.
+ */
+ int kvm_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs)
+ {
+ 	const int writeback = 0;
+
+ 	guard(srcu)(&vcpu->kvm->srcu);
+
+ 	return msr_io(vcpu, user_msrs, do_set_msr, writeback);
+ }
+
+ /*
+ * Read MSR @msr and store its value at the userspace address @user_val.
+ *
+ * Returns 0 on success, -EINVAL if the MSR read fails, or -EFAULT if the
+ * value can't be written to userspace.
+ */
+ static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
+ {
+ 	u64 value;
+
+ 	if (do_get_msr(vcpu, msr, &value))
+ 		return -EINVAL;
+
+ 	return put_user(value, user_val) ? -EFAULT : 0;
+ }
+
+ /*
+ * Fetch a 64-bit value from the userspace address @user_val and write it to
+ * MSR @msr.
+ *
+ * Returns 0 on success, -EFAULT if the value can't be read from userspace,
+ * or -EINVAL if the MSR write fails.
+ */
+ static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
+ {
+ 	u64 value;
+
+ 	if (get_user(value, user_val))
+ 		return -EFAULT;
+
+ 	return do_set_msr(vcpu, msr, &value) ? -EINVAL : 0;
+ }
+
+ struct kvm_x86_reg_id { /* field view of kvm_one_reg.id, see the cast in kvm_get_set_one_reg() */
+ __u32 index; /* register index; used as the MSR index once type is KVM_X86_REG_TYPE_MSR */
+ __u8 type; /* KVM_X86_REG_TYPE_*; KVM-defined registers are translated to an MSR */
+ __u8 rsvd1; /* reserved; kvm_get_set_one_reg() returns -EINVAL if non-zero */
+ __u8 rsvd2:4; /* reserved; kvm_get_set_one_reg() returns -EINVAL if non-zero */
+ __u8 size:4; /* presumably overlays the KVM_REG_SIZE_MASK bits of the id -- TODO confirm */
+ __u8 x86; /* presumably overlays the KVM_REG_ARCH_MASK bits of the id -- TODO confirm */
+ };
+
+ /*
+ * Translate a KVM-defined register id into the internal MSR that backs it,
+ * rewriting @reg's type and index in place.  Only KVM_REG_GUEST_SSP is
+ * handled; everything else is rejected.
+ *
+ * Returns 0 on success, -EINVAL for an unknown register or one that the
+ * vCPU model doesn't provide.
+ */
+ static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu,
+ 				 struct kvm_x86_reg_id *reg)
+ {
+ 	if (reg->index != KVM_REG_GUEST_SSP)
+ 		return -EINVAL;
+
+ 	/*
+ 	 * FIXME: If host-initiated accesses are ever exempted from
+ 	 * ignore_msrs (in kvm_do_msr_access()), drop this manual check and
+ 	 * rely on KVM's standard checks to reject accesses to regs that
+ 	 * don't exist.
+ 	 */
+ 	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ 		return -EINVAL;
+
+ 	reg->type = KVM_X86_REG_TYPE_MSR;
+ 	reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
+ 	return 0;
+ }
+
+ /*
+ * Handle a single-register get or set request, selected by @ioctl
+ * (KVM_GET_ONE_REG reads, anything else writes).  @argp points at a
+ * userspace struct kvm_one_reg.
+ *
+ * Only 64-bit, MSR-backed x86 registers are supported; KVM-defined registers
+ * are first translated to their backing MSR.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory can't be accessed, or
+ * -EINVAL for a malformed/unsupported register id or a failed MSR access.
+ */
+ int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl,
+ 			void __user *argp)
+ {
+ 	struct kvm_x86_reg_id *reg;
+ 	struct kvm_one_reg one_reg;
+ 	u64 __user *uaddr;
+ 	bool xstate_msr;
+ 	int r;
+
+ 	if (copy_from_user(&one_reg, argp, sizeof(one_reg)))
+ 		return -EFAULT;
+
+ 	if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86)
+ 		return -EINVAL;
+
+ 	/* View the 64-bit id through its x86-specific field layout. */
+ 	reg = (struct kvm_x86_reg_id *)&one_reg.id;
+ 	if (reg->rsvd1 || reg->rsvd2)
+ 		return -EINVAL;
+
+ 	if (reg->type == KVM_X86_REG_TYPE_KVM) {
+ 		r = kvm_translate_kvm_reg(vcpu, reg);
+ 		if (r)
+ 			return r;
+ 	}
+
+ 	if (reg->type != KVM_X86_REG_TYPE_MSR ||
+ 	    (one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
+ 		return -EINVAL;
+
+ 	guard(srcu)(&vcpu->kvm->srcu);
+
+ 	/* XSTATE-managed MSRs need the guest FPU state loaded for the access. */
+ 	xstate_msr = is_xstate_managed_msr(vcpu, reg->index);
+ 	if (xstate_msr)
+ 		kvm_load_guest_fpu(vcpu);
+
+ 	uaddr = u64_to_user_ptr(one_reg.addr);
+ 	r = (ioctl == KVM_GET_ONE_REG) ?
+ 		kvm_get_one_msr(vcpu, reg->index, uaddr) :
+ 		kvm_set_one_msr(vcpu, reg->index, uaddr);
+
+ 	if (xstate_msr)
+ 		kvm_put_guest_fpu(vcpu);
+
+ 	return r;
+ }
+
+ /*
+ * Report the list of registers for @vcpu to userspace.  The list currently
+ * holds at most one entry, KVM_REG_GUEST_SSP, which is present only if the
+ * guest has SHSTK.
+ *
+ * The number of registers is always written to user_list->n, even when the
+ * count supplied by userspace is too small for the list.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory can't be accessed, or
+ * -E2BIG if userspace's count is smaller than the number of registers.
+ */
+ int kvm_get_reg_list(struct kvm_vcpu *vcpu,
+ 		     struct kvm_reg_list __user *user_list)
+ {
+ 	u64 user_nr_regs;
+ 	u64 nr_regs = 0;
+
+ 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ 		nr_regs = 1;
+
+ 	if (get_user(user_nr_regs, &user_list->n))
+ 		return -EFAULT;
+
+ 	if (put_user(nr_regs, &user_list->n))
+ 		return -EFAULT;
+
+ 	if (user_nr_regs < nr_regs)
+ 		return -E2BIG;
+
+ 	if (!nr_regs)
+ 		return 0;
+
+ 	if (put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
+ 		return -EFAULT;
+
+ 	return 0;
+ }
+
+ /*
+ * Allocate a zeroed MSR filter and record its default policy.  Returns NULL
+ * if the allocation fails.
+ */
+ static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+ {
+ 	struct kvm_x86_msr_filter *filter;
+
+ 	filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT);
+ 	if (filter)
+ 		filter->default_allow = default_allow;
+
+ 	return filter;
+ }
+
+ /*
+ * Free an MSR filter along with the bitmap of each of its populated ranges.
+ * A NULL @msr_filter is a nop.
+ */
+ void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
+ {
+ 	u32 idx = 0;
+
+ 	if (!msr_filter)
+ 		return;
+
+ 	while (idx < msr_filter->count) {
+ 		kfree(msr_filter->ranges[idx].bitmap);
+ 		idx++;
+ 	}
+
+ 	kfree(msr_filter);
+ }
+
+ /*
+ * Append one userspace-provided range to @msr_filter, duplicating the
+ * range's bitmap from userspace memory.  A range with zero MSRs is silently
+ * skipped.
+ *
+ * Returns 0 on success (or if the range is skipped), -EINVAL if the flags
+ * are empty or contain unknown bits or if the bitmap size is out of bounds,
+ * or the error from memdup_user().
+ */
+ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ 			      struct kvm_msr_filter_range *user_range)
+ {
+ 	struct msr_bitmap_range *slot;
+ 	unsigned long *bitmap;
+ 	size_t nbytes;
+
+ 	if (!user_range->nmsrs)
+ 		return 0;
+
+ 	if ((user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) ||
+ 	    !user_range->flags)
+ 		return -EINVAL;
+
+ 	/* The bitmap holds one bit per MSR, rounded up to whole longs. */
+ 	nbytes = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ 	if (!nbytes || nbytes > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ 		return -EINVAL;
+
+ 	bitmap = memdup_user((__user u8*)user_range->bitmap, nbytes);
+ 	if (IS_ERR(bitmap))
+ 		return PTR_ERR(bitmap);
+
+ 	slot = &msr_filter->ranges[msr_filter->count];
+ 	*slot = (struct msr_bitmap_range) {
+ 		.flags = user_range->flags,
+ 		.base = user_range->base,
+ 		.nmsrs = user_range->nmsrs,
+ 		.bitmap = bitmap,
+ 	};
+ 	msr_filter->count++;
+
+ 	return 0;
+ }
+
+ /*
+ * Install a new MSR filter for @kvm, built from the already-copied-in
+ * @filter, and free the filter it replaces.
+ *
+ * Returns 0 on success, -EINVAL for unknown flags, for a default-deny filter
+ * without any ranges, or for an invalid range, -ENOMEM if the new filter
+ * can't be allocated, or the error from duplicating a range's bitmap.
+ */
+ int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter)
+ {
+ 	struct kvm_x86_msr_filter *new_filter, *old_filter;
+ 	bool has_ranges = false;
+ 	bool default_allow;
+ 	u32 idx;
+ 	int r;
+
+ 	if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
+ 		return -EINVAL;
+
+ 	for (idx = 0; idx < ARRAY_SIZE(filter->ranges); idx++) {
+ 		if (filter->ranges[idx].nmsrs)
+ 			has_ranges = true;
+ 	}
+
+ 	default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ 	if (!has_ranges && !default_allow)
+ 		return -EINVAL;
+
+ 	new_filter = kvm_alloc_msr_filter(default_allow);
+ 	if (!new_filter)
+ 		return -ENOMEM;
+
+ 	for (idx = 0; idx < ARRAY_SIZE(filter->ranges); idx++) {
+ 		r = kvm_add_msr_filter(new_filter, &filter->ranges[idx]);
+ 		if (!r)
+ 			continue;
+
+ 		kvm_free_msr_filter(new_filter);
+ 		return r;
+ 	}
+
+ 	/*
+ 	 * Publish the new filter under kvm->lock, then wait for SRCU readers
+ 	 * to finish before the old filter is freed.
+ 	 */
+ 	mutex_lock(&kvm->lock);
+ 	old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ 					 mutex_is_locked(&kvm->lock));
+ 	mutex_unlock(&kvm->lock);
+ 	synchronize_srcu(&kvm->srcu);
+
+ 	kvm_free_msr_filter(old_filter);
+
+ 	/*
+ 	 * Recalc MSR intercepts as userspace may want to intercept accesses to
+ 	 * MSRs that KVM would otherwise pass through to the guest.
+ 	 */
+ 	kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS);
+
+ 	return 0;
+ }
+
+
+ /*
+ * Append @msr_index to msr_based_features[] if reading it as a feature MSR
+ * succeeds.  The value that is read is discarded.
+ */
+ static void kvm_probe_feature_msr(u32 msr_index)
+ {
+ 	u64 ignored;
+
+ 	if (!kvm_get_feature_msr(NULL, msr_index, &ignored, true))
+ 		msr_based_features[num_msr_based_features++] = msr_index;
+ }
+
+ static void kvm_probe_msr_to_save(u32 msr_index)
+ {
+ u32 dummy[2]; /* scratch for rdmsr_safe(); the values read are never used */
+
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) /* skip MSRs that can't be read on the host */
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.  Each case below returns early, i.e. leaves the MSR out
+ * of msrs_to_save[], when the KVM capability or hardware resource that
+ * backs the MSR is missing.  MSRs without a case are recorded as soon
+ * as the host read above has succeeded.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX: /* needs at least one of RDTSCP or RDPID */
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK: /* needs either ToPA or single-range output */
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: /* two MSRs per supported address range */
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ...
+ MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: /* limited to kvm_pmu_cap.num_counters_gp */
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ...
+ MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: /* limited to kvm_pmu_cap.num_counters_gp */
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ...
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: /* limited to kvm_pmu_cap.num_counters_fixed */
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ return;
+ break;
+ case MSR_IA32_XSS:
+ if (!kvm_caps.supported_xss)
+ return;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET: /* needs at least one of SHSTK or IBT */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ return;
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ if (!kvm_cpu_cap_has(X86_FEATURE_LM))
+ return;
+ fallthrough; /* INT_SSP_TAB needs SHSTK in addition to LM */
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ return;
+ break;
+ default:
+ break;
+ }
+
+ msrs_to_save[num_msrs_to_save++] = msr_index; /* passed every check: record it */
+ }
+
+ /*
+ * (Re)build the lists of MSRs to save, emulated MSRs, and MSR-based
+ * features.  All three counts are reset first, so each list is rebuilt from
+ * scratch.
+ */
+ void kvm_init_msr_lists(void)
+ {
+ 	unsigned idx;
+
+ 	BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
+ 			 "Please update the fixed PMCs in msrs_to_save_pmu[]");
+
+ 	num_msr_based_features = 0;
+ 	num_emulated_msrs = 0;
+ 	num_msrs_to_save = 0;
+
+ 	/* MSRs to save: the base set, plus the PMU set if the PMU is enabled. */
+ 	for (idx = 0; idx < ARRAY_SIZE(msrs_to_save_base); idx++)
+ 		kvm_probe_msr_to_save(msrs_to_save_base[idx]);
+
+ 	if (enable_pmu) {
+ 		for (idx = 0; idx < ARRAY_SIZE(msrs_to_save_pmu); idx++)
+ 			kvm_probe_msr_to_save(msrs_to_save_pmu[idx]);
+ 	}
+
+ 	/* Emulated MSRs: keep those accepted by the has_emulated_msr() hook. */
+ 	for (idx = 0; idx < ARRAY_SIZE(emulated_msrs_all); idx++) {
+ 		if (kvm_x86_call(has_emulated_msr)(NULL,
+ 						   emulated_msrs_all[idx]))
+ 			emulated_msrs[num_emulated_msrs++] =
+ 				emulated_msrs_all[idx];
+ 	}
+
+ 	/* Feature MSRs: the emulated VMX range first, then everything else. */
+ 	idx = KVM_FIRST_EMULATED_VMX_MSR;
+ 	while (idx <= KVM_LAST_EMULATED_VMX_MSR) {
+ 		kvm_probe_feature_msr(idx);
+ 		idx++;
+ 	}
+
+ 	for (idx = 0; idx < ARRAY_SIZE(msr_based_features_all_except_vmx); idx++)
+ 		kvm_probe_feature_msr(msr_based_features_all_except_vmx[idx]);
+ }
+
+ /*
+ * Test that setting IA32_SPEC_CTRL to @value is allowed by the host
+ * processor.  The MSR's original value is restored after a successful test
+ * write, and the whole sequence runs with local IRQs disabled.
+ *
+ * Returns 0 if the value is accepted, 1 if reading the MSR or writing
+ * @value to it faults.
+ */
+ int kvm_spec_ctrl_test_value(u64 value)
+ {
+ 	unsigned long flags;
+ 	u64 saved_value;
+ 	int ret = 1;
+
+ 	local_irq_save(flags);
+
+ 	if (!rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value) &&
+ 	    !wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) {
+ 		wrmsrq(MSR_IA32_SPEC_CTRL, saved_value);
+ 		ret = 0;
+ 	}
+
+ 	local_irq_restore(flags);
+
+ 	return ret;
+ }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value);
diff --git a/arch/x86/kvm/msrs.h b/arch/x86/kvm/msrs.h
new file mode 100644
index 000000000000..b698983e37fb
--- /dev/null
+++ b/arch/x86/kvm/msrs.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_MSR_H
+#define ARCH_X86_KVM_MSR_H
+
+#include <linux/kvm_host.h>
+#include <linux/user-return-notifier.h>
+
+#include "cpuid.h"
+#include "regs.h"
+
+extern bool report_ignored_msrs;
+extern bool ignore_msrs;
+
+extern u32 __read_mostly kvm_nr_uret_msrs;
+
+ /* Log an unhandled WRMSR and its data, but only if report_ignored_msrs is set. */
+ static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+ {
+ 	if (!report_ignored_msrs)
+ 		return;
+
+ 	vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+ }
+
+ /* Log an unhandled RDMSR, but only if report_ignored_msrs is set. */
+ static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+ {
+ 	if (!report_ignored_msrs)
+ 		return;
+
+ 	vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+ }
+
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
+/*
+ * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves
+ * are arbitrary and have no meaning, the only requirement is that they don't
+ * conflict with "real" MSRs that KVM supports. Use values at the upper end
+ * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values
+ * will be usable until KVM exhausts its supply of paravirtual MSR indices.
+ */
+#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff
+
+#define MSR_IA32_CR_PAT_DEFAULT \
+ PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
+
+void kvm_init_msr_lists(void);
+int kvm_get_msr_index_list(struct kvm_msr_list __user *user_msr_list);
+int kvm_get_feature_msr_index_list(struct kvm_msr_list __user *user_msr_list);
+int kvm_get_feature_msrs(struct kvm_msrs __user *user_msrs);
+
+int kvm_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs);
+int kvm_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs);
+
+int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl,
+ void __user *argp);
+int kvm_get_reg_list(struct kvm_vcpu *vcpu,
+ struct kvm_reg_list __user *user_list);
+
+void kvm_enable_efer_bits(u64);
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data);
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+
+fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+int kvm_add_user_return_msr(u32 msr);
+int kvm_find_user_return_msr(u32 msr);
+int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
+u64 kvm_get_user_return_msr(unsigned int slot);
+
+ /* Returns true if the lookup of @msr as a user-return MSR yields a non-negative result. */
+ static inline bool kvm_is_supported_user_return_msr(u32 msr)
+ {
+ 	int slot = kvm_find_user_return_msr(msr);
+
+ 	return slot >= 0;
+ }
+
+void kvm_user_return_msr_cpu_online(void);
+void drop_user_return_notifiers(void);
+void kvm_destroy_user_return_msrs(void);
+
+int kvm_emulator_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index,
+ u64 *pdata);
+int kvm_emulator_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 msr_index,
+ u64 data);
+int kvm_emulator_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+
+ enum kvm_msr_access { /* bit flags describing the kind of MSR access */
+ MSR_TYPE_R = BIT(0), /* read access */
+ MSR_TYPE_W = BIT(1), /* write access */
+ MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, /* both read and write access */
+ };
+
+/*
+ * Internal error codes that are used to indicate that MSR emulation encountered
+ * an error that should result in #GP in the guest, unless userspace handles it.
+ * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
+ * as part of KVM's lightly documented internal KVM_RUN return codes.
+ *
+ * UNSUPPORTED - The MSR isn't supported, either because it is completely
+ * unknown to KVM, or because the MSR should not exist according
+ * to the vCPU model.
+ *
+ * FILTERED - Access to the MSR is denied by a userspace MSR filter.
+ */
+#define KVM_MSR_RET_UNSUPPORTED 2
+#define KVM_MSR_RET_FILTERED 3
+
+int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter);
+void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter);
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+
+u64 kvm_get_arch_capabilities(void);
+int kvm_spec_ctrl_test_value(u64 value);
+
+#define CET_US_RESERVED_BITS GENMASK(9, 6)
+#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0)
+#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
+#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12)
+
+ /*
+ * Check whether @data is acceptable as a U_CET/S_CET value for @vcpu.  The
+ * value is rejected if it sets reserved bits, sets SHSTK or IBT bits that
+ * the guest's CPU model lacks, carries a legacy bitmap base that isn't
+ * 4-aligned after the shift by 12, or sets both SUPPRESS and WAIT_ENDBR.
+ */
+ static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
+ {
+ 	if (data & CET_US_RESERVED_BITS)
+ 		return false;
+
+ 	/* Feature-specific bits are legal only if the guest has the feature. */
+ 	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ 	    (data & CET_US_SHSTK_MASK_BITS))
+ 		return false;
+
+ 	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ 	    (data & CET_US_IBT_MASK_BITS))
+ 		return false;
+
+ 	if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4))
+ 		return false;
+
+ 	/* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */
+ 	return !((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR));
+ }
+
+#endif
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 6f74e2b27c1e..c4ec024943bb 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,7 +19,7 @@
#include <asm/mtrr.h>
#include "cpuid.h"
-#include "x86.h"
+#include "msrs.h"
static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index dd1c57593f48..7f777049d328 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -43,6 +43,18 @@ module_param(enable_pmu, bool, 0444);
bool __read_mostly enable_mediated_pmu;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu);
+struct kvm_x86_pmu_event_filter { /* PMU event filter as kept by KVM x86 */
+ __u32 action;
+ __u32 nevents; /* number of elements in events[], per its __counted_by() */
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 nr_includes; /* presumably the number of entries at @includes -- TODO confirm */
+ __u32 nr_excludes; /* presumably the number of entries at @excludes -- TODO confirm */
+ __u64 *includes;
+ __u64 *excludes;
+ __u64 events[] __counted_by(nevents); /* flexible array member, must stay last */
+};
+
struct kvm_pmu_emulated_event_selectors {
u64 INSTRUCTIONS_RETIRED;
u64 BRANCH_INSTRUCTIONS_RETIRED;
diff --git a/arch/x86/kvm/regs.c b/arch/x86/kvm/regs.c
new file mode 100644
index 000000000000..bd8147798cc3
--- /dev/null
+++ b/arch/x86/kvm/regs.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+#include "lapic.h"
+#include "mmu.h"
+#include "regs.h"
+#include "x86.h"
+
+/*
+ * Return the vCPU's RIP as a linear address: RIP itself in 64-bit mode,
+ * otherwise CS.base + RIP truncated to 32 bits.  Returns 0 when guest state
+ * is protected, as RIP can't be read in that case.
+ */
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
+{
+	unsigned long cs_base;
+
+	if (vcpu->arch.guest_state_protected)
+		return 0;
+
+	if (is_64_bit_mode(vcpu))
+		return kvm_rip_read(vcpu);
+
+	cs_base = kvm_get_segment_base(vcpu, VCPU_SREG_CS);
+
+	return (u32)(cs_base + kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip);
+
+/* Return true if the vCPU's current linear RIP equals @linear_rip. */
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+	unsigned long cur_rip = kvm_get_linear_rip(vcpu);
+
+	return cur_rip == linear_rip;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip);
+
+/*
+ * Read RFLAGS from the vendor code.  X86_EFLAGS_TF is masked out of the
+ * result while KVM_GUESTDBG_SINGLESTEP is active.
+ */
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags = kvm_x86_call(get_rflags)(vcpu);
+
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))
+		return flags;
+
+	return flags & ~X86_EFLAGS_TF;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags);
+
+/*
+ * Write @rflags through the vendor code.  X86_EFLAGS_TF is forced on when
+ * KVM_GUESTDBG_SINGLESTEP is active and the vCPU currently sits at the
+ * recorded single-step RIP.
+ */
+void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	const bool stepping = vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP;
+
+	if (stepping && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+		rflags |= X86_EFLAGS_TF;
+
+	kvm_x86_call(set_rflags)(vcpu, rflags);
+}
+
+/*
+ * Set the guest's RFLAGS via __kvm_set_rflags() and raise KVM_REQ_EVENT on
+ * the vCPU.
+ */
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags);
+
+/*
+ * Copy the vCPU's general purpose registers, RIP and RFLAGS into @regs.
+ * R8-R15 are only copied when CONFIG_X86_64 is enabled.
+ */
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Register state needs to be copied
+ * back from the emulation context to the vCPU. Userspace
+ * shouldn't normally do that, but some badly designed PV
+ * devices (vmware backdoor interface) need this to work.
+ */
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
+ regs->rax = kvm_rax_read_raw(vcpu);
+ regs->rbx = kvm_rbx_read_raw(vcpu);
+ regs->rcx = kvm_rcx_read_raw(vcpu);
+ regs->rdx = kvm_rdx_read_raw(vcpu);
+ regs->rsi = kvm_rsi_read_raw(vcpu);
+ regs->rdi = kvm_rdi_read_raw(vcpu);
+ regs->rsp = kvm_rsp_read(vcpu);
+ regs->rbp = kvm_rbp_read_raw(vcpu);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_r8_read_raw(vcpu);
+ regs->r9 = kvm_r9_read_raw(vcpu);
+ regs->r10 = kvm_r10_read_raw(vcpu);
+ regs->r11 = kvm_r11_read_raw(vcpu);
+ regs->r12 = kvm_r12_read_raw(vcpu);
+ regs->r13 = kvm_r13_read_raw(vcpu);
+ regs->r14 = kvm_r14_read_raw(vcpu);
+ regs->r15 = kvm_r15_read_raw(vcpu);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+}
+
+/*
+ * Fill @regs from the vCPU, bracketed by vcpu_load()/vcpu_put().  Returns 0,
+ * or -EINVAL if the VM has protected state and this vCPU's state is already
+ * protected.
+ */
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	const bool state_hidden = vcpu->kvm->arch.has_protected_state &&
+				  vcpu->arch.guest_state_protected;
+
+	if (state_hidden)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	__get_regs(vcpu, regs);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Load the general purpose registers, RIP and RFLAGS from @regs into the
+ * vCPU.  RFLAGS is written with X86_EFLAGS_FIXED forced on, any pending
+ * exception (including a pending exception VM-Exit) is dropped, and
+ * KVM_REQ_EVENT is raised.  R8-R15 are only written when CONFIG_X86_64 is
+ * enabled.
+ */
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ /*
+ * Flag that emulator register state must be re-synced from the vCPU,
+ * and cancel any pending sync in the opposite direction.
+ */
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ kvm_rax_write_raw(vcpu, regs->rax);
+ kvm_rbx_write_raw(vcpu, regs->rbx);
+ kvm_rcx_write_raw(vcpu, regs->rcx);
+ kvm_rdx_write_raw(vcpu, regs->rdx);
+ kvm_rsi_write_raw(vcpu, regs->rsi);
+ kvm_rdi_write_raw(vcpu, regs->rdi);
+ kvm_rsp_write(vcpu, regs->rsp);
+ kvm_rbp_write_raw(vcpu, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_r8_write_raw(vcpu, regs->r8);
+ kvm_r9_write_raw(vcpu, regs->r9);
+ kvm_r10_write_raw(vcpu, regs->r10);
+ kvm_r11_write_raw(vcpu, regs->r11);
+ kvm_r12_write_raw(vcpu, regs->r12);
+ kvm_r13_write_raw(vcpu, regs->r13);
+ kvm_r14_write_raw(vcpu, regs->r14);
+ kvm_r15_write_raw(vcpu, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception_vmexit.pending = false;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/*
+ * Load @regs into the vCPU, bracketed by vcpu_load()/vcpu_put().  Returns 0,
+ * or -EINVAL if the VM has protected state and this vCPU's state is already
+ * protected.
+ */
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	const bool state_hidden = vcpu->kvm->arch.has_protected_state &&
+				  vcpu->arch.guest_state_protected;
+
+	if (state_hidden)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	__set_regs(vcpu, regs);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Mask of bits that load_pdptrs() requires to be clear in a present PDPTE:
+ * the vCPU's reserved GPA bits plus bits 8:5 and 2:1.
+ */
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+	u64 rsvd = vcpu->arch.reserved_gpa_bits;
+
+	rsvd |= rsvd_bits(5, 8);
+	rsvd |= rsvd_bits(1, 2);
+
+	return rsvd;
+}
+
+/*
+ * Load the PAE PDPTRs referenced by @cr3 into vcpu->arch.pdptrs.
+ *
+ * Returns 1 if the PDPTEs could be read and are all valid (no reserved bit
+ * set in any present entry), 0 otherwise.  On failure the cached PDPTRs are
+ * left untouched.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct kvm_pagewalk *w = &vcpu->arch.gva_walk;
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ gpa_t real_gpa;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK |
+ PFERR_GUEST_PAGE_MASK, NULL, 0);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
+ /* Non-present entries are not checked for reserved bits. */
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
+ return 0;
+ }
+ }
+
+ /*
+ * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)))
+ kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.root_mmu,
+ KVM_MMU_ROOT_CURRENT);
+
+ memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+ /* These PDPTRs were read from guest memory, not supplied by userspace. */
+ vcpu->arch.pdptrs_from_userspace = false;
+
+ return 1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs);
+
+/*
+ * Check the architectural CR0 consistency rules handled in common code, then
+ * defer to the vendor-specific is_valid_cr0 hook.
+ */
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+#ifdef CONFIG_X86_64
+	/* The upper 32 bits must be zero. */
+	if (cr0 & 0xffffffff00000000UL)
+		return false;
+#endif
+
+	/* NW without CD, and PG without PE, are both illegal. */
+	if (((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) ||
+	    ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)))
+		return false;
+
+	return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
+}
+
+/*
+ * React to CR0 changing from @old_cr0 to @cr0: update or reset the MMU
+ * context and raise the TLB flush / async #PF requests the change calls for.
+ * kvm_set_cr0() invokes this right after the vendor set_cr0 hook.
+ */
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+{
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
+
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
+ }
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ /*
+ * Clearing CR0.PG is defined to flush the TLB from the guest's
+ * perspective.
+ */
+ if (!(cr0 & X86_CR0_PG))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ /*
+ * Check for async #PF completion events when enabling paging,
+ * as the vCPU may have previously encountered async #PFs (it's
+ * entirely legal for the guest to toggle paging on/off without
+ * waiting for the async #PF queue to drain).
+ */
+ else if (kvm_pv_async_pf_enabled(vcpu))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ }
+
+ /* Any change to a bit that feeds the MMU role needs a full reset. */
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0);
+
+/*
+ * Validate @cr0 and, if acceptable, install it via the vendor set_cr0 hook
+ * and run kvm_post_set_cr0().  X86_CR0_ET is always forced on and
+ * CR0_RESERVED_BITS are silently dropped.
+ *
+ * Returns 0 on success, 1 if the value or the mode transition it implies is
+ * rejected.
+ */
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+
+ if (!kvm_is_valid_cr0(vcpu, cr0))
+ return 1;
+
+ cr0 |= X86_CR0_ET;
+
+ /* Write to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
+
+#ifdef CONFIG_X86_64
+ /* Turning paging on with EFER.LME=1 requires PAE and CS.L=0. */
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
+ }
+#endif
+ /*
+ * With EFER.LME=0, PAE and paging on, reload the PDPTRs whenever one of
+ * the X86_CR0_PDPTR_BITS changes; fail if they can't be loaded.
+ */
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
+
+ /* Paging can't be off in 64-bit mode or while CR4.PCIDE is set. */
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
+ return 1;
+
+ /* CR0.WP can't be clear while CR4.CET is set. */
+ if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET))
+ return 1;
+
+ kvm_x86_call(set_cr0)(vcpu, cr0);
+
+ kvm_post_set_cr0(vcpu, old_cr0, cr0);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);
+
+/*
+ * Combine the current CR0 bits selected by ~0x0e with the low four bits of
+ * @msw and hand the result to kvm_set_cr0().  The result of kvm_set_cr0() is
+ * deliberately ignored.
+ */
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+	unsigned long new_cr0 = kvm_read_cr0_bits(vcpu, ~0x0eul);
+
+	new_cr0 |= msw & 0x0f;
+
+	(void)kvm_set_cr0(vcpu, new_cr0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
+
+/*
+ * Handle a load of CR3 with @cr3.
+ *
+ * Returns 0 on success, 1 if @cr3 is not a legal CR3 value for the vCPU or
+ * if PAE paging is active and the PDPTRs can't be loaded.
+ */
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
+#ifdef CONFIG_X86_64
+ /*
+ * With CR4.PCIDE=1 the NOFLUSH bit is a flush hint rather than part of
+ * the value: remember it, strip it, and extract the target PCID.
+ */
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
+ }
+#endif
+
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
+
+ /*
+ * Do not condition the GPA check on long mode, this helper is used to
+ * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
+ * the current vCPU mode is accurate.
+ */
+ if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
+ return 1;
+
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+ return 1;
+
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
+
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3);
+
+/*
+ * A CR4 value is valid if it passes both the common __kvm_is_valid_cr4()
+ * check and the vendor-specific is_valid_cr4 hook, in that order.
+ */
+static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	if (!__kvm_is_valid_cr4(vcpu, cr4))
+		return false;
+
+	return kvm_x86_call(is_valid_cr4)(vcpu, cr4);
+}
+
+/*
+ * React to CR4 changing from @old_cr4 to @cr4: reset or unload the MMU and
+ * request the TLB flush the change calls for.  kvm_set_cr4() invokes this
+ * right after the vendor set_cr4 hook.
+ */
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
+{
+ /* Any change to a bit that feeds the MMU role needs a full reset. */
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
+ * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
+ * so fall through.
+ */
+ if (!tdp_enabled &&
+ (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
+ kvm_mmu_unload(vcpu);
+
+ /*
+ * The TLB has to be flushed for all PCIDs if any of the following
+ * (architecturally required) changes happen:
+ * - CR4.PCIDE is changed from 1 to 0
+ * - CR4.PGE is toggled
+ *
+ * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
+ */
+ if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+ /*
+ * The TLB has to be flushed for the current PCID if any of the
+ * following (architecturally required) changes happen:
+ * - CR4.SMEP is changed from 0 to 1
+ * - CR4.PAE is toggled
+ */
+ else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
+ ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4);
+
+/*
+ * Validate @cr4 and, if acceptable, install it via the vendor set_cr4 hook
+ * and run kvm_post_set_cr4().
+ *
+ * Returns 0 on success, 1 if the value or the mode transition it implies is
+ * rejected.
+ */
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+
+ if (!kvm_is_valid_cr4(vcpu, cr4))
+ return 1;
+
+ /*
+ * In long mode PAE must stay set and LA57 can't be toggled.  Outside of
+ * long mode, with paging and PAE on, the PDPTRs are reloaded whenever
+ * one of the X86_CR4_PDPTR_BITS changes.
+ */
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
+ return 1;
+
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
+ /* CR4.CET can't be set while CR0.WP is clear. */
+ if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP))
+ return 1;
+
+ kvm_x86_call(set_cr4)(vcpu, cr4);
+
+ kvm_post_set_cr4(vcpu, old_cr4, cr4);
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4);
+
+/*
+ * Set CR8.  With an in-kernel local APIC the value is forwarded to the APIC
+ * as the TPR, otherwise it is cached in vcpu->arch.cr8.  Returns 1 if any
+ * reserved bit is set, 0 on success.
+ */
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (cr8 & CR8_RESERVED_BITS)
+		return 1;
+
+	if (!lapic_in_kernel(vcpu)) {
+		vcpu->arch.cr8 = cr8;
+		return 0;
+	}
+
+	kvm_lapic_set_tpr(vcpu, cr8);
+
+	return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8);
+
+/*
+ * Read CR8, from the in-kernel local APIC when there is one and from
+ * vcpu->arch.cr8 otherwise.
+ */
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+	if (!lapic_in_kernel(vcpu))
+		return vcpu->arch.cr8;
+
+	return kvm_lapic_get_cr8(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8);
+
+/*
+ * Fill the part of @sregs that is shared by the sregs and sregs2 getters.
+ * When guest state is protected only CR0, CR4, CR8, EFER and the APIC base
+ * are reported; the segment, descriptor table, CR2 and CR3 fields are left
+ * as the caller provided them.
+ */
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct desc_ptr dt;
+
+ if (vcpu->arch.guest_state_protected)
+ goto skip_protected_regs;
+
+ kvm_handle_exception_payload_quirk(vcpu);
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ /* @dt is reused: first for the IDT, then for the GDT. */
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
+
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+
+skip_protected_regs:
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = vcpu->arch.apic_base;
+}
+
+/*
+ * Fill @sregs.  On top of the common fields, an injected interrupt that is
+ * not a soft interrupt is recorded by setting its vector's bit in the
+ * interrupt bitmap (skipped when guest state is protected).
+ */
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	__get_sregs_common(vcpu, sregs);
+
+	if (vcpu->arch.guest_state_protected)
+		return;
+
+	if (!vcpu->arch.interrupt.injected || vcpu->arch.interrupt.soft)
+		return;
+
+	set_bit(vcpu->arch.interrupt.nr,
+		(unsigned long *)sregs->interrupt_bitmap);
+}
+
+/*
+ * Fill @sregs from the vCPU, bracketed by vcpu_load()/vcpu_put().  Returns
+ * 0, or -EINVAL if the VM has protected state and this vCPU's state is
+ * already protected.
+ */
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	const bool state_hidden = vcpu->kvm->arch.has_protected_state &&
+				  vcpu->arch.guest_state_protected;
+
+	if (state_hidden)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	__get_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
+/*
+ * Fill @sregs2.  On top of the common fields, the four PDPTRs are reported
+ * and flagged as valid when the vCPU uses PAE paging and its state is not
+ * protected.  The PDPTRs are read under the vCPU's SRCU read lock.
+ */
+void kvm_vcpu_ioctl_x86_get_sregs2(struct kvm_vcpu *vcpu,
+				   struct kvm_sregs2 *sregs2)
+{
+	int n;
+
+	__get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+	if (vcpu->arch.guest_state_protected)
+		return;
+
+	if (!is_pae_paging(vcpu))
+		return;
+
+	kvm_vcpu_srcu_read_lock(vcpu);
+	for (n = 0; n < 4; n++)
+		sregs2->pdptrs[n] = kvm_pdptr_read(vcpu, n);
+	sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+	kvm_vcpu_srcu_read_unlock(vcpu);
+}
+
+/*
+ * Check that the EFER/CR0/CR3/CR4/CS.L combination in @sregs is consistent
+ * before any of it is applied to the vCPU.
+ */
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	const bool lme_and_pg = (sregs->efer & EFER_LME) &&
+				(sregs->cr0 & X86_CR0_PG);
+
+	if (lme_and_pg) {
+		/*
+		 * When EFER.LME and CR0.PG are set, the processor is in
+		 * 64-bit mode (though maybe in a 32-bit code segment).
+		 * CR4.PAE and EFER.LMA must be set, and CR3 must be legal.
+		 */
+		if (!(sregs->cr4 & X86_CR4_PAE))
+			return false;
+		if (!(sregs->efer & EFER_LMA))
+			return false;
+		if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
+			return false;
+	} else if ((sregs->efer & EFER_LMA) || sregs->cs.l) {
+		/*
+		 * Not in 64-bit mode: EFER.LMA must be clear and the code
+		 * segment cannot be 64-bit.
+		 */
+		return false;
+	}
+
+	if (!kvm_is_valid_cr4(vcpu, sregs->cr4))
+		return false;
+
+	return kvm_is_valid_cr0(vcpu, sregs->cr0);
+}
+
+/*
+ * Apply the part of @sregs that is shared by the sregs and sregs2 setters.
+ *
+ * @mmu_reset_needed: set to non-zero if CR3, EFER, CR0 or CR4 changed, or if
+ *                    the PDPTRs were reloaded; never cleared here.
+ * @update_pdptrs:    reload the PDPTRs from guest memory if the vCPU ends up
+ *                    in PAE paging mode.
+ *
+ * Returns 0 on success, -EINVAL if @sregs fails validation or the APIC base
+ * is rejected.  When guest state is protected, only the APIC base is applied.
+ */
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
+{
+ int idx;
+ struct desc_ptr dt;
+
+ if (!kvm_is_valid_sregs(vcpu, sregs))
+ return -EINVAL;
+
+ if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
+ return -EINVAL;
+
+ if (vcpu->arch.guest_state_protected)
+ return 0;
+
+ /* @dt is reused: first for the IDT, then for the GDT. */
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
+ kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
+
+ /* Each comparison must happen before the new value is written. */
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_call(set_efer)(vcpu, sregs->efer);
+
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
+
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
+
+ /*
+ * The result of load_pdptrs() is ignored; an MMU reset is requested
+ * whenever the reload was attempted.
+ */
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ /* The return value (reserved-bit failure) is not checked here. */
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ return 0;
+}
+
+/*
+ * Apply @sregs to the vCPU, resetting the MMU context and requesting a guest
+ * TLB flush if required, then queue the first vector found in the interrupt
+ * bitmap, if any.  Returns 0 on success or the error from
+ * __set_sregs_common().
+ */
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	const int nr_vectors = KVM_NR_INTERRUPTS;
+	int reset_mmu = 0;
+	int vec, rc;
+
+	rc = __set_sregs_common(vcpu, sregs, &reset_mmu, true);
+	if (rc)
+		return rc;
+
+	if (reset_mmu) {
+		kvm_mmu_reset_context(vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+	}
+
+	vec = find_first_bit((const unsigned long *)sregs->interrupt_bitmap,
+			     nr_vectors);
+	if (vec >= nr_vectors)
+		return 0;
+
+	kvm_queue_interrupt(vcpu, vec, false);
+	pr_debug("Set back pending irq %d\n", vec);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	return 0;
+}
+
+/*
+ * Apply @sregs to the vCPU, bracketed by vcpu_load()/vcpu_put().  Returns the
+ * result of __set_sregs(), or -EINVAL if the VM has protected state and this
+ * vCPU's state is already protected.
+ */
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	const bool state_hidden = vcpu->kvm->arch.has_protected_state &&
+				  vcpu->arch.guest_state_protected;
+	int rc;
+
+	if (state_hidden)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	rc = __set_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return rc;
+}
+
+/*
+ * Apply @sregs2 to the vCPU.  If userspace flags the PDPTRs as valid they
+ * are taken from @sregs2 instead of being reloaded from guest memory; that
+ * is only accepted when @sregs2 describes PAE paging (CR0.PG=1, CR4.PAE=1,
+ * EFER.LMA=0) and guest state is not protected.
+ *
+ * Returns 0 on success, -EINVAL on unknown flags, an unacceptable PDPTR
+ * request, or a failure from __set_sregs_common().
+ */
+int kvm_vcpu_ioctl_x86_set_sregs2(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs2 *sregs2)
+{
+	const bool user_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+	const bool pae_paging = (sregs2->cr0 & X86_CR0_PG) &&
+				(sregs2->cr4 & X86_CR4_PAE) &&
+				!(sregs2->efer & EFER_LMA);
+	int reset_mmu = 0;
+	int n, rc;
+
+	if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+		return -EINVAL;
+
+	if (user_pdptrs && !pae_paging)
+		return -EINVAL;
+	if (user_pdptrs && vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
+	rc = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+				&reset_mmu, !user_pdptrs);
+	if (rc)
+		return rc;
+
+	if (user_pdptrs) {
+		for (n = 0; n < 4; n++)
+			kvm_pdptr_write(vcpu, n, sregs2->pdptrs[n]);
+
+		kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
+		reset_mmu = 1;
+		vcpu->arch.pdptrs_from_userspace = true;
+	}
+
+	if (!reset_mmu)
+		return 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+
+	return 0;
+}
+
+/*
+ * Copy the register sets selected in run->kvm_valid_regs (general purpose
+ * and/or special registers) from the vCPU into the sync-regs area of
+ * vcpu->run.
+ */
+void kvm_run_sync_regs_to_user(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+	if (run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+		__get_regs(vcpu, &run->s.regs.regs);
+
+	if (run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+		__get_sregs(vcpu, &run->s.regs.sregs);
+}
+
+/*
+ * Load the register sets flagged in run->kvm_dirty_regs from the sync-regs
+ * area of vcpu->run into the vCPU, clearing each dirty flag once its set has
+ * been applied.
+ *
+ * Returns 0 on success, -EINVAL if the special registers are rejected (their
+ * dirty flag is left set in that case).
+ */
+int kvm_run_sync_regs_from_user(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ /*
+ * Work on a stack snapshot rather than on the run structure
+ * directly.
+ * NOTE(review): presumably so the values can't change between
+ * validation and use -- confirm.
+ */
+ struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
+
+ if (__set_sregs(vcpu, &sregs))
+ return -EINVAL;
+
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+
+ return 0;
+}
+
+/*
+ * Mirror the guest's DR0-DR3 values into the effective debug registers,
+ * unless KVM_GUESTDBG_USE_HW_BP is set, in which case eff_db is left alone.
+ */
+void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+	int n;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+		return;
+
+	for (n = 0; n < KVM_NR_DB_REGS; n++)
+		vcpu->arch.eff_db[n] = vcpu->arch.db[n];
+}
+
+/*
+ * Push the effective DR7 to the vendor code and recompute
+ * KVM_DEBUGREG_BP_ENABLED in switch_db_regs: the flag is set iff any of the
+ * DR7_BP_EN_MASK bits is set in the effective DR7.
+ *
+ * The effective value is guest_debug_dr7 when KVM_GUESTDBG_USE_HW_BP is
+ * active and the guest's own DR7 otherwise; use kvm_get_effective_dr7() from
+ * regs.h rather than open coding that selection a second time.
+ */
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+	unsigned long dr7 = kvm_get_effective_dr7(vcpu);
+
+	kvm_x86_call(set_dr7)(vcpu, dr7);
+	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+	if (dr7 & DR7_BP_EN_MASK)
+		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7);
+
+/*
+ * Bits that are always set in the guest's DR6: DR6_FIXED_1, plus DR6_RTM
+ * and DR6_BUS_LOCK when the guest lacks the corresponding CPU feature.
+ */
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+	const bool has_rtm = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
+	const bool has_bld = guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT);
+	u64 mask = DR6_FIXED_1;
+
+	if (!has_rtm)
+		mask |= DR6_RTM;
+	if (!has_bld)
+		mask |= DR6_BUS_LOCK;
+
+	return mask;
+}
+
+/*
+ * Write @val to debug register @dr.  DR4 is handled as DR6 and DR5 as DR7;
+ * any @dr not matched by an explicit case is treated as DR7.
+ *
+ * Returns 0 on success, 1 if @val fails the DR6/DR7 validity check (marked
+ * as #GP in the comments below).
+ */
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ /* Effective DRs track the guest value unless HW breakpoints are in use. */
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ case 6:
+ if (!kvm_dr6_valid(val))
+ return 1; /* #GP */
+ /* Keep only the volatile bits and force the fixed ones. */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
+ break;
+ case 5:
+ default: /* 7 */
+ if (!kvm_dr7_valid(val))
+ return 1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr);
+
+/*
+ * Read debug register @dr.  DR4 reads as DR6; DR5 and any other value not
+ * matched by an explicit case read as DR7.
+ */
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
+{
+	switch (dr) {
+	case 0 ... 3: {
+		size_t nr_db = ARRAY_SIZE(vcpu->arch.db);
+
+		return vcpu->arch.db[array_index_nospec(dr, nr_db)];
+	}
+	case 4:
+	case 6:
+		return vcpu->arch.dr6;
+	default: /* 5 and 7 */
+		return vcpu->arch.dr7;
+	}
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
+
+/*
+ * Report the guest's DR0-DR3, DR6 and DR7 in @dbgregs; every other field of
+ * @dbgregs is zeroed.  Returns 0, or -EINVAL if the VM has protected state
+ * and this vCPU's state is already protected.
+ */
+int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+				     struct kvm_debugregs *dbgregs)
+{
+	unsigned int n;
+
+	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
+	kvm_handle_exception_payload_quirk(vcpu);
+
+	memset(dbgregs, 0, sizeof(*dbgregs));
+
+	dbgregs->dr6 = vcpu->arch.dr6;
+	dbgregs->dr7 = vcpu->arch.dr7;
+	for (n = 0; n < ARRAY_SIZE(dbgregs->db); n++)
+		dbgregs->db[n] = vcpu->arch.db[n];
+
+	return 0;
+}
+
+/*
+ * Load DR0-DR3, DR6 and DR7 from @dbgregs and refresh the effective debug
+ * register state.  Returns 0 on success, or -EINVAL if guest state is
+ * protected, @dbgregs->flags is non-zero, or DR6/DR7 fail validation.
+ */
+int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+				     struct kvm_debugregs *dbgregs)
+{
+	unsigned int n;
+
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
+	if (dbgregs->flags || !kvm_dr6_valid(dbgregs->dr6) ||
+	    !kvm_dr7_valid(dbgregs->dr7))
+		return -EINVAL;
+
+	for (n = 0; n < ARRAY_SIZE(vcpu->arch.db); n++)
+		vcpu->arch.db[n] = dbgregs->db[n];
+	kvm_update_dr0123(vcpu);
+
+	vcpu->arch.dr6 = dbgregs->dr6;
+	vcpu->arch.dr7 = dbgregs->dr7;
+	kvm_update_dr7(vcpu);
+
+	return 0;
+}
diff --git a/arch/x86/kvm/regs.h b/arch/x86/kvm/regs.h
index 5bda738afb7c..94fd86728fed 100644
--- a/arch/x86/kvm/regs.h
+++ b/arch/x86/kvm/regs.h
@@ -16,6 +16,18 @@
static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
+void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+
static inline bool is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
@@ -397,6 +409,14 @@ static inline bool kvm_dr6_valid(u64 data)
return !(data >> 32);
}
+/*
+ * Return the DR7 value that is in effect for the vCPU: guest_debug_dr7 when
+ * KVM_GUESTDBG_USE_HW_BP is active, the guest's own DR7 otherwise.
+ */
+static inline unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu)
+{
+	const bool use_hw_bp = vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP;
+
+	return use_hw_bp ? vcpu->arch.guest_debug_dr7 : vcpu->arch.dr7;
+}
+
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
@@ -420,4 +440,44 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
+/* Return the base of segment @seg as reported by the vendor get_segment_base hook. */
+static inline unsigned long kvm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_call(get_segment_base)(vcpu, seg);
+}
+
+/* Hand segment description @var for segment @seg to the vendor set_segment hook. */
+static inline void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_call(set_segment)(vcpu, var, seg);
+}
+
+/* Ask the vendor get_segment hook to fill @var for segment @seg. */
+static inline void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_call(get_segment)(vcpu, var, seg);
+}
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+void kvm_vcpu_ioctl_x86_get_sregs2(struct kvm_vcpu *vcpu,
+ struct kvm_sregs2 *sregs2);
+int kvm_vcpu_ioctl_x86_set_sregs2(struct kvm_vcpu *vcpu,
+ struct kvm_sregs2 *sregs2);
+
+void kvm_run_sync_regs_to_user(struct kvm_vcpu *vcpu);
+int kvm_run_sync_regs_from_user(struct kvm_vcpu *vcpu);
+
+void kvm_update_dr0123(struct kvm_vcpu *vcpu);
+void kvm_update_dr7(struct kvm_vcpu *vcpu);
+int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs);
+int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs);
+
+
#endif
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3e6c671a8dc2..ba985a02208a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -23,6 +23,7 @@
#include "kvm_emulate.h"
#include "trace.h"
+#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "smm.h"
@@ -112,16 +113,15 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3,
svm->nested.ctl.misc_ctl);
- vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
- vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+ vcpu->arch.ngpa_walk.get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.ngpa_walk.get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.ngpa_walk.inject_page_fault = nested_svm_inject_npf_exit;
}
static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
vcpu->arch.mmu = &vcpu->arch.root_mmu;
- vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
@@ -2150,7 +2150,7 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 pte_access)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk;
if (WARN_ON_ONCE(!mmu_is_nested(vcpu)))
return gpa;
@@ -2159,7 +2159,7 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET))
access |= PFERR_USER_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+ return w->gva_to_gpa(vcpu, w, gpa, access, exception);
}
struct kvm_x86_nested_ops svm_nested_ops = {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4d2bacd00ec4..ba4ac1d860fd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4055,12 +4055,12 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (vcpu->arch.nested_run_pending)
- return -EBUSY;
-
if (svm_interrupt_blocked(vcpu))
return 0;
+ if (vcpu->arch.nested_run_pending)
+ return -EBUSY;
+
/*
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
* e.g. if the IRQ arrived asynchronously after checking nested events.
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
index 3f9150125e70..117bf8bec07d 100644
--- a/arch/x86/kvm/tss.h
+++ b/arch/x86/kvm/tss.h
@@ -57,4 +57,11 @@ struct tss_segment_16 {
u16 ldt;
};
+/*
+ * RMODE_TSS_SIZE is the sum of a 0x68 byte base area, a 256-bit (32 byte)
+ * redirection area, a 65536-bit (8192 byte) IOPB area and one extra byte.
+ * TSS_IOPB_BASE_OFFSET is not part of that sum.
+ */
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
#endif
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6957bb6f5cf7..0635e92471c8 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -11,6 +11,7 @@
#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "irq.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
@@ -407,7 +408,7 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
if (roots)
- kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
+ kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.ngpa_walk, addr, roots);
}
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -511,17 +512,15 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
nested_ept_new_eptp(vcpu);
- vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
- vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
- vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
+ vcpu->arch.ngpa_walk.get_guest_pgd = nested_ept_get_eptp;
+ vcpu->arch.ngpa_walk.get_pdptr = kvm_pdptr_read;
- vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+ vcpu->arch.ngpa_walk.inject_page_fault = nested_ept_inject_page_fault;
}
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
vcpu->arch.mmu = &vcpu->arch.root_mmu;
- vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -7463,12 +7462,13 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
return 0;
}
+
static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 access,
struct x86_exception *exception,
u64 pte_access)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk;
if (WARN_ON_ONCE(!mmu_is_nested(vcpu)))
return gpa;
@@ -7481,7 +7481,7 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
access |= PFERR_USER_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+ return w->gva_to_gpa(vcpu, w, gpa, access, exception);
}
struct kvm_x86_nested_ops vmx_nested_ops = {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cc75feec05da..a03add00f923 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -72,6 +72,7 @@
#include "x86.h"
#include "x86_ops.h"
#include "smm.h"
+#include "tss.h"
#include "vmx_onhyperv.h"
#include "vmenter.h"
#include "posted_intr.h"
@@ -1186,6 +1187,18 @@ static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
}
+/*
+ * Return the selector currently loaded in LDTR.  SLDT only accepts a register
+ * or memory destination, so constrain the output to "rm" (the same operand
+ * constraint vmx_load_ldt() uses) instead of "g", which nominally also
+ * permits an immediate and is meaningless for an output operand.
+ */
+static u16 vmx_store_ldt(void)
+{
+ u16 ldt;
+
+ asm("sldt %0" : "=rm"(ldt));
+ return ldt;
+}
+
+/*
+ * Load @sel into LDTR.  An asm statement without output operands is
+ * implicitly volatile; the qualifier is spelled out purely for readability.
+ */
+static void vmx_load_ldt(u16 sel)
+{
+ asm volatile("lldt %0" : : "rm"(sel));
+}
+
#ifdef CONFIG_X86_32
/*
* On 32-bit kernels, VM exits still load the FS and GS bases from the
@@ -1203,7 +1216,7 @@ static unsigned long segment_base(u16 selector)
table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
- u16 ldt_selector = kvm_read_ldt();
+ u16 ldt_selector = vmx_store_ldt();
if (!(ldt_selector & ~SEGMENT_RPL_MASK))
return 0;
@@ -1358,7 +1371,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
- host_state->ldt_sel = kvm_read_ldt();
+ host_state->ldt_sel = vmx_store_ldt();
#ifdef CONFIG_X86_64
savesegment(ds, host_state->ds_sel);
@@ -1405,7 +1418,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
- kvm_load_ldt(host_state->ldt_sel);
+ vmx_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(host_state->gs_sel);
#else
@@ -5238,6 +5251,9 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
+ if (vmx_interrupt_blocked(vcpu))
+ return 0;
+
if (vcpu->arch.nested_run_pending)
return -EBUSY;
@@ -5248,7 +5264,7 @@ int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return -EBUSY;
- return !vmx_interrupt_blocked(vcpu);
+ return 1;
}
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -8703,7 +8719,7 @@ __init int vmx_hardware_setup(void)
/*
* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
- * bits to shadow_zero_check.
+ * bits in the MMU's struct kvm_page_format.
*/
vmx_setup_me_spte_mask();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index afcac1042947..0626e835e9eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -80,7 +80,6 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
-#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <asm/sgx.h>
#include <asm/virt.h>
@@ -90,8 +89,6 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
-#define MAX_IO_MSRS 256
-
/*
* Note, kvm_caps fields should *never* have default values, all fields must be
* recomputed from scratch during vendor module load, e.g. to account for a
@@ -108,17 +105,12 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host);
#define emul_to_vcpu(ctxt) \
((struct kvm_vcpu *)(ctxt)->vcpu)
-/* EFER defaults:
- * - enable syscall per default because its emulated by KVM
- * - enable LME and LMA per default on 64 bit KVM
+/*
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
*/
-#ifdef CONFIG_X86_64
-static
-u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
-#else
-static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
-#endif
-
+#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
@@ -128,17 +120,38 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \
KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)
+#define KVM_CLOCK_VALID_FLAGS \
+ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+#define KVM_X86_VALID_QUIRKS \
+ (KVM_X86_QUIRK_LINT0_REENABLED | \
+ KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
+ KVM_X86_QUIRK_OUT_7E_INC_RIP | \
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL | \
+ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT | \
+ KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM | \
+ KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT)
+
+#define KVM_X86_CONDITIONAL_QUIRKS \
+ (KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
+ KVM_BUS_LOCK_DETECTION_EXIT)
+
+#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \
+ KVM_X86_NOTIFY_VMEXIT_USER)
+
static void process_nmi(struct kvm_vcpu *vcpu);
-static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
-static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
-static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
-
static DEFINE_MUTEX(vendor_module_lock);
-static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
@@ -152,13 +165,6 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl);
-static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, 0644);
-
-bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, 0644);
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
-
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);
@@ -181,34 +187,10 @@ module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, 0644);
-bool __read_mostly eager_page_split = true;
-module_param(eager_page_split, bool, 0644);
-
/* Enable/disable SMT_RSB bug mitigation */
static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);
-/*
- * Restoring the host value for MSRs that are only consumed when running in
- * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
- * returns to userspace, i.e. the kernel can run with the guest's value.
- */
-#define KVM_MAX_NR_USER_RETURN_MSRS 16
-
-struct kvm_user_return_msrs {
- struct user_return_notifier urn;
- bool registered;
- struct kvm_user_return_msr_values {
- u64 host;
- u64 curr;
- } values[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
-u32 __read_mostly kvm_nr_uret_msrs;
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
-static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
-static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
-
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
@@ -311,249 +293,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
static struct kmem_cache *x86_emulator_cache;
-/*
- * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
- * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
- * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
- * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
- * MSRs that KVM emulates without strictly requiring host support.
- * msr_based_features holds MSRs that enumerate features, i.e. are effectively
- * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
- * msrs_to_save and emulated_msrs.
- */
-
-static const u32 msrs_to_save_base[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
- MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
- MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
- MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
- MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
- MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
- MSR_IA32_UMWAIT_CONTROL,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS,
-
- MSR_IA32_U_CET, MSR_IA32_S_CET,
- MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
- MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
- MSR_IA32_DEBUGCTLMSR,
- MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP,
- MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP,
-};
-
-static const u32 msrs_to_save_pmu[] = {
- MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
- MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL,
- MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
-
- /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
- MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
- MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
- MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
- MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
- MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
- MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
-
- MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
- MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
-
- /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
- MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
- MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
- MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
- MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
-};
-
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
- ARRAY_SIZE(msrs_to_save_pmu)];
-static unsigned num_msrs_to_save;
-
-static const u32 emulated_msrs_all[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-
-#ifdef CONFIG_KVM_HYPERV
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
- HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
- HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
- HV_X64_MSR_RESET,
- HV_X64_MSR_VP_INDEX,
- HV_X64_MSR_VP_RUNTIME,
- HV_X64_MSR_SCONTROL,
- HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_VP_ASSIST_PAGE,
- HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
- HV_X64_MSR_SYNDBG_OPTIONS,
- HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
- HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
- HV_X64_MSR_SYNDBG_PENDING_BUFFER,
-#endif
-
- MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
-
- MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSC_DEADLINE,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
- MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
- MSR_IA32_MCG_EXT_CTL,
- MSR_IA32_SMBASE,
- MSR_SMI_COUNT,
- MSR_PLATFORM_INFO,
- MSR_MISC_FEATURES_ENABLES,
- MSR_AMD64_VIRT_SPEC_CTRL,
- MSR_AMD64_TSC_RATIO,
- MSR_IA32_POWER_CTL,
- MSR_IA32_UCODE_REV,
-
- /*
- * KVM always supports the "true" VMX control MSRs, even if the host
- * does not. The VMX MSRs as a whole are considered "emulated" as KVM
- * doesn't strictly require them to exist in the host (ignoring that
- * KVM would refuse to load in the first place if the core set of MSRs
- * aren't supported).
- */
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
- MSR_K7_HWCR,
- MSR_KVM_POLL_CONTROL,
-};
-
-static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
-static unsigned num_emulated_msrs;
-
-/*
- * List of MSRs that control the existence of MSR-based features, i.e. MSRs
- * that are effectively CPUID leafs. VMX MSRs are also included in the set of
- * feature MSRs, but are handled separately to allow expedited lookups.
- */
-static const u32 msr_based_features_all_except_vmx[] = {
- MSR_AMD64_DE_CFG,
- MSR_IA32_UCODE_REV,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
- MSR_PLATFORM_INFO,
-};
-
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
- (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
-static unsigned int num_msr_based_features;
-
-/*
- * All feature MSRs except uCode revID, which tracks the currently loaded uCode
- * patch, are immutable once the vCPU model is defined.
- */
-static bool kvm_is_immutable_feature_msr(u32 msr)
-{
- int i;
-
- if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
- return true;
-
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
- if (msr == msr_based_features_all_except_vmx[i])
- return msr != MSR_IA32_UCODE_REV;
- }
-
- return false;
-}
-
-static bool kvm_is_advertised_msr(u32 msr_index)
-{
- unsigned int i;
-
- for (i = 0; i < num_msrs_to_save; i++) {
- if (msrs_to_save[i] == msr_index)
- return true;
- }
-
- for (i = 0; i < num_emulated_msrs; i++) {
- if (emulated_msrs[i] == msr_index)
- return true;
- }
-
- return false;
-}
-
-typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated);
-
-static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
- u64 *data, bool host_initiated,
- enum kvm_msr_access rw,
- msr_access_t msr_access_fn)
-{
- const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
- int ret;
-
- BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
-
- /*
- * Zero the data on read failures to avoid leaking stack data to the
- * guest and/or userspace, e.g. if the failure is ignored below.
- */
- ret = msr_access_fn(vcpu, msr, data, host_initiated);
- if (ret && rw == MSR_TYPE_R)
- *data = 0;
-
- if (ret != KVM_MSR_RET_UNSUPPORTED)
- return ret;
-
- /*
- * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
- * advertises to userspace, even if an MSR isn't fully supported.
- * Simply check that @data is '0', which covers both the write '0' case
- * and all reads (in which case @data is zeroed on failure; see above).
- */
- if (host_initiated && !*data && kvm_is_advertised_msr(msr))
- return 0;
-
- if (!ignore_msrs) {
- kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
- op, msr, *data);
- return ret;
- }
-
- if (report_ignored_msrs)
- kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
-
- return 0;
-}
-
static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
@@ -567,128 +306,6 @@ static struct kmem_cache *kvm_alloc_emulator_cache(void)
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
-static void kvm_destroy_user_return_msrs(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
-
- kvm_nr_uret_msrs = 0;
-}
-
-static void kvm_on_user_return(struct user_return_notifier *urn)
-{
- unsigned slot;
- struct kvm_user_return_msrs *msrs
- = container_of(urn, struct kvm_user_return_msrs, urn);
- struct kvm_user_return_msr_values *values;
-
- msrs->registered = false;
- user_return_notifier_unregister(urn);
-
- for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
- values = &msrs->values[slot];
- if (values->host != values->curr) {
- wrmsrq(kvm_uret_msrs_list[slot], values->host);
- values->curr = values->host;
- }
- }
-}
-
-static int kvm_probe_user_return_msr(u32 msr)
-{
- u64 val;
- int ret;
-
- preempt_disable();
- ret = rdmsrq_safe(msr, &val);
- if (ret)
- goto out;
- ret = wrmsrq_safe(msr, val);
-out:
- preempt_enable();
- return ret;
-}
-
-int kvm_add_user_return_msr(u32 msr)
-{
- BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
-
- if (kvm_probe_user_return_msr(msr))
- return -1;
-
- kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
- return kvm_nr_uret_msrs++;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr);
-
-int kvm_find_user_return_msr(u32 msr)
-{
- int i;
-
- for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- if (kvm_uret_msrs_list[i] == msr)
- return i;
- }
- return -1;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
-
-static void kvm_user_return_msr_cpu_online(void)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
- u64 value;
- int i;
-
- for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- rdmsrq_safe(kvm_uret_msrs_list[i], &value);
- msrs->values[i].host = value;
- msrs->values[i].curr = value;
- }
-}
-
-static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
-{
- if (!msrs->registered) {
- msrs->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&msrs->urn);
- msrs->registered = true;
- }
-}
-
-int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
- int err;
-
- value = (value & mask) | (msrs->values[slot].host & ~mask);
- if (value == msrs->values[slot].curr)
- return 0;
- err = wrmsrq_safe(kvm_uret_msrs_list[slot], value);
- if (err)
- return 1;
-
- msrs->values[slot].curr = value;
- kvm_user_return_register_notifier(msrs);
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
-
-u64 kvm_get_user_return_msr(unsigned int slot)
-{
- return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
-
-static void drop_user_return_notifiers(void)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
-
- if (msrs->registered)
- kvm_on_user_return(&msrs->urn);
-}
-
/*
* Handle a fault on a hardware virtualization (VMX or SVM) instruction.
*
@@ -943,17 +560,6 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp);
-static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
- EMULTYPE_COMPLETE_USER_EXIT);
-}
-
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault,
bool from_hardware)
{
@@ -976,11 +582,12 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault,
bool from_hardware)
{
- struct kvm_mmu *fault_mmu;
+ struct kvm_pagewalk *fault_walk;
+
WARN_ON_ONCE(fault->vector != PF_VECTOR);
- fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
- vcpu->arch.walk_mmu;
+ fault_walk = fault->nested_page_fault ? &vcpu->arch.ngpa_walk :
+ &vcpu->arch.gva_walk;
/*
* Invalidate the TLB entry for the faulting address, if it exists,
@@ -988,10 +595,10 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
*/
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
- kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ kvm_mmu_invalidate_addr(vcpu, fault_walk, fault->address,
KVM_MMU_ROOT_CURRENT);
- fault_mmu->inject_page_fault(vcpu, fault, from_hardware);
+ fault_walk->inject_page_fault(vcpu, fault, from_hardware);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_inject_emulated_page_fault);
@@ -1017,170 +624,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
-static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
-}
-
-/*
- * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
- */
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- gpa_t real_gpa;
- int i;
- int ret;
- u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
-
- /*
- * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
- * to an L1 GPA.
- */
- real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
- PFERR_USER_MASK | PFERR_WRITE_MASK |
- PFERR_GUEST_PAGE_MASK, NULL, 0);
- if (real_gpa == INVALID_GPA)
- return 0;
-
- /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
- cr3 & GENMASK(11, 5), sizeof(pdpte));
- if (ret < 0)
- return 0;
-
- for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if ((pdpte[i] & PT_PRESENT_MASK) &&
- (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
- return 0;
- }
- }
-
- /*
- * Marking VCPU_REG_PDPTR dirty doesn't work for !tdp_enabled.
- * Shadow page roots need to be reconstructed instead.
- */
- if (!tdp_enabled && memcmp(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)))
- kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
-
- memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
- kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
- kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
- vcpu->arch.pdptrs_from_userspace = false;
-
- return 1;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs);
-
-static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
- return false;
-#endif
-
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return false;
-
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return false;
-
- return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
-}
-
-void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
-{
- /*
- * CR0.WP is incorporated into the MMU role, but only for non-nested,
- * indirect shadow MMUs. If paging is disabled, no updates are needed
- * as there are no permission bits to emulate. If TDP is enabled, the
- * MMU's metadata needs to be updated, e.g. so that emulating guest
- * translations does the right thing, but there's no need to unload the
- * root as CR0.WP doesn't affect SPTEs.
- */
- if ((cr0 ^ old_cr0) == X86_CR0_WP) {
- if (!(cr0 & X86_CR0_PG))
- return;
-
- if (tdp_enabled) {
- kvm_init_mmu(vcpu);
- return;
- }
- }
-
- if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- /*
- * Clearing CR0.PG is defined to flush the TLB from the guest's
- * perspective.
- */
- if (!(cr0 & X86_CR0_PG))
- kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
- /*
- * Check for async #PF completion events when enabling paging,
- * as the vCPU may have previously encountered async #PFs (it's
- * entirely legal for the guest to toggle paging on/off without
- * waiting for the async #PF queue to drain).
- */
- else if (kvm_pv_async_pf_enabled(vcpu))
- kvm_make_request(KVM_REQ_APF_READY, vcpu);
- }
-
- if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
- kvm_mmu_reset_context(vcpu);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0);
-
-int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- unsigned long old_cr0 = kvm_read_cr0(vcpu);
-
- if (!kvm_is_valid_cr0(vcpu, cr0))
- return 1;
-
- cr0 |= X86_CR0_ET;
-
- /* Write to CR0 reserved bits are ignored, even on Intel. */
- cr0 &= ~CR0_RESERVED_BITS;
-
-#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
- (cr0 & X86_CR0_PG)) {
- int cs_db, cs_l;
-
- if (!is_pae(vcpu))
- return 1;
- kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
- }
-#endif
- if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
- is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
- !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
- return 1;
-
- if (!(cr0 & X86_CR0_PG) &&
- (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
- return 1;
-
- if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET))
- return 1;
-
- kvm_x86_call(set_cr0)(vcpu, cr0);
-
- kvm_post_set_cr0(vcpu, old_cr0, cr0);
-
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);
-
-void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
-{
- (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
-
static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest)
{
if (vcpu->arch.guest_state_protected)
@@ -1224,13 +667,6 @@ static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
}
}
-#ifdef CONFIG_X86_64
-static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
-}
-#endif
-
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
@@ -1290,89 +726,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv);
-static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- return __kvm_is_valid_cr4(vcpu, cr4) &&
- kvm_x86_call(is_valid_cr4)(vcpu, cr4);
-}
-
-void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
-{
- if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
- kvm_mmu_reset_context(vcpu);
-
- /*
- * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
- * according to the SDM; however, stale prev_roots could be reused
- * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
- * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
- * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
- * so fall through.
- */
- if (!tdp_enabled &&
- (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
- kvm_mmu_unload(vcpu);
-
- /*
- * The TLB has to be flushed for all PCIDs if any of the following
- * (architecturally required) changes happen:
- * - CR4.PCIDE is changed from 1 to 0
- * - CR4.PGE is toggled
- *
- * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
- */
- if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
- kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
-
- /*
- * The TLB has to be flushed for the current PCID if any of the
- * following (architecturally required) changes happen:
- * - CR4.SMEP is changed from 0 to 1
- * - CR4.PAE is toggled
- */
- else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
- ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4);
-
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
-
- if (!kvm_is_valid_cr4(vcpu, cr4))
- return 1;
-
- if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE))
- return 1;
- if ((cr4 ^ old_cr4) & X86_CR4_LA57)
- return 1;
- } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
- && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
- return 1;
-
- if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
- /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
- return 1;
- }
-
- if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP))
- return 1;
-
- kvm_x86_call(set_cr4)(vcpu, cr4);
-
- kvm_post_set_cr4(vcpu, old_cr4, cr4);
-
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4);
-
-static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
unsigned long roots_to_free = 0;
@@ -1415,167 +769,6 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}
-int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
- bool skip_tlb_flush = false;
- unsigned long pcid = 0;
-#ifdef CONFIG_X86_64
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
- skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
- cr3 &= ~X86_CR3_PCID_NOFLUSH;
- pcid = cr3 & X86_CR3_PCID_MASK;
- }
-#endif
-
- /* PDPTRs are always reloaded for PAE paging. */
- if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
- goto handle_tlb_flush;
-
- /*
- * Do not condition the GPA check on long mode, this helper is used to
- * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
- * the current vCPU mode is accurate.
- */
- if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
- return 1;
-
- if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
- return 1;
-
- if (cr3 != kvm_read_cr3(vcpu))
- kvm_mmu_new_pgd(vcpu, cr3);
-
- vcpu->arch.cr3 = cr3;
- kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
- /* Do not call post_set_cr3, we do not get here for confidential guests. */
-
-handle_tlb_flush:
- /*
- * A load of CR3 that flushes the TLB flushes only the current PCID,
- * even if PCID is disabled, in which case PCID=0 is flushed. It's a
- * moot point in the end because _disabling_ PCID will flush all PCIDs,
- * and it's impossible to use a non-zero PCID when PCID is disabled,
- * i.e. only PCID=0 can be relevant.
- */
- if (!skip_tlb_flush)
- kvm_invalidate_pcid(vcpu, pcid);
-
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3);
-
-int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (cr8 & CR8_RESERVED_BITS)
- return 1;
- if (lapic_in_kernel(vcpu))
- kvm_lapic_set_tpr(vcpu, cr8);
- else
- vcpu->arch.cr8 = cr8;
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8);
-
-unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
-{
- if (lapic_in_kernel(vcpu))
- return kvm_lapic_get_cr8(vcpu);
- else
- return vcpu->arch.cr8;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8);
-
-static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
-{
- int i;
-
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- for (i = 0; i < KVM_NR_DB_REGS; i++)
- vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- }
-}
-
-void kvm_update_dr7(struct kvm_vcpu *vcpu)
-{
- unsigned long dr7;
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- dr7 = vcpu->arch.guest_debug_dr7;
- else
- dr7 = vcpu->arch.dr7;
- kvm_x86_call(set_dr7)(vcpu, dr7);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
- if (dr7 & DR7_BP_EN_MASK)
- vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7);
-
-static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
-{
- u64 fixed = DR6_FIXED_1;
-
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))
- fixed |= DR6_RTM;
-
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
- fixed |= DR6_BUS_LOCK;
- return fixed;
-}
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- size_t size = ARRAY_SIZE(vcpu->arch.db);
-
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[array_index_nospec(dr, size)] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- case 6:
- if (!kvm_dr6_valid(val))
- return 1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
- break;
- case 5:
- default: /* 7 */
- if (!kvm_dr7_valid(val))
- return 1; /* #GP */
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- kvm_update_dr7(vcpu);
- break;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr);
-
-unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
-{
- size_t size = ARRAY_SIZE(vcpu->arch.db);
-
- switch (dr) {
- case 0 ... 3:
- return vcpu->arch.db[array_index_nospec(dr, size)];
- case 4:
- case 6:
- return vcpu->arch.dr6;
- case 5:
- default: /* 7 */
- return vcpu->arch.dr7;
- }
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
-
-static unsigned long kvm_get_effective_dr7(struct kvm_vcpu *vcpu)
-{
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- return vcpu->arch.guest_debug_dr7;
-
- return vcpu->arch.dr7;
-}
-
int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
u32 pmc = kvm_ecx_read(vcpu);
@@ -1592,595 +785,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc);
-/*
- * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
- * does not yet virtualize. These include:
- * 10 - MISC_PACKAGE_CTRLS
- * 11 - ENERGY_FILTERING_CTL
- * 12 - DOITM
- * 18 - FB_CLEAR_CTRL
- * 21 - XAPIC_DISABLE_STATUS
- * 23 - OVERCLOCKING_STATUS
- */
-
-#define KVM_SUPPORTED_ARCH_CAP \
- (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
- ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
- ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
- ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
-
-static u64 kvm_get_arch_capabilities(void)
-{
- u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
-
- /*
- * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
- * the nested hypervisor runs with NX huge pages. If it is not,
- * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
- * L1 guests, so it need not worry about its own (L2) guests.
- */
- data |= ARCH_CAP_PSCHANGE_MC_NO;
-
- /*
- * If we're doing cache flushes (either "always" or "cond")
- * we will do one whenever the guest does a vmlaunch/vmresume.
- * If an outer hypervisor is doing the cache flush for us
- * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
- * capability to the guest too, and if EPT is disabled we're not
- * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
- * require a nested hypervisor to do a flush of its own.
- */
- if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
- data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
-
- if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
- data |= ARCH_CAP_RDCL_NO;
- if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- data |= ARCH_CAP_SSB_NO;
- if (!boot_cpu_has_bug(X86_BUG_MDS))
- data |= ARCH_CAP_MDS_NO;
- if (!boot_cpu_has_bug(X86_BUG_RFDS))
- data |= ARCH_CAP_RFDS_NO;
- if (!boot_cpu_has_bug(X86_BUG_ITS))
- data |= ARCH_CAP_ITS_NO;
-
- if (!boot_cpu_has(X86_FEATURE_RTM)) {
- /*
- * If RTM=0 because the kernel has disabled TSX, the host might
- * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
- * and therefore knows that there cannot be TAA) but keep
- * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
- * and we want to allow migrating those guests to tsx=off hosts.
- */
- data &= ~ARCH_CAP_TAA_NO;
- } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
- data |= ARCH_CAP_TAA_NO;
- } else {
- /*
- * Nothing to do here; we emulate TSX_CTRL if present on the
- * host so the guest can choose between disabling TSX or
- * using VERW to clear CPU buffers.
- */
- }
-
- if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
- data |= ARCH_CAP_GDS_NO;
-
- return data;
-}
-
-static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
-{
- WARN_ON_ONCE(!host_initiated);
-
- switch (index) {
- case MSR_IA32_ARCH_CAPABILITIES:
- *data = kvm_get_arch_capabilities();
- break;
- case MSR_IA32_PERF_CAPABILITIES:
- *data = kvm_caps.supported_perf_cap;
- break;
- case MSR_PLATFORM_INFO:
- *data = MSR_PLATFORM_INFO_CPUID_FAULT;
- break;
- case MSR_IA32_UCODE_REV:
- rdmsrq_safe(index, data);
- break;
- default:
- return kvm_x86_call(get_feature_msr)(index, data);
- }
- return 0;
-}
-
-static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
- kvm_get_feature_msr);
-}
-
-static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS))
- return false;
-
- if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT))
- return false;
-
- if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
- return false;
-
- if (efer & (EFER_LME | EFER_LMA) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
- return false;
-
- if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX))
- return false;
-
- return true;
-
-}
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (efer & efer_reserved_bits)
- return false;
-
- return __kvm_valid_efer(vcpu, efer);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer);
-
-static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- u64 old_efer = vcpu->arch.efer;
- u64 efer = msr_info->data;
- int r;
-
- if (efer & efer_reserved_bits)
- return 1;
-
- if (!msr_info->host_initiated) {
- if (!__kvm_valid_efer(vcpu, efer))
- return 1;
-
- if (is_paging(vcpu) &&
- (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
- return 1;
- }
-
- efer &= ~EFER_LMA;
- efer |= vcpu->arch.efer & EFER_LMA;
-
- r = kvm_x86_call(set_efer)(vcpu, efer);
- if (r) {
- WARN_ON(r > 0);
- return r;
- }
-
- if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
- kvm_mmu_reset_context(vcpu);
-
- if (!static_cpu_has(X86_FEATURE_XSAVES) &&
- (efer & EFER_SVME))
- kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
-
- return 0;
-}
-
-void kvm_enable_efer_bits(u64 mask)
-{
- efer_reserved_bits &= ~mask;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits);
-
-bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
-{
- struct kvm_x86_msr_filter *msr_filter;
- struct msr_bitmap_range *ranges;
- struct kvm *kvm = vcpu->kvm;
- bool allowed;
- int idx;
- u32 i;
-
- /* x2APIC MSRs do not support filtering. */
- if (index >= 0x800 && index <= 0x8ff)
- return true;
-
- idx = srcu_read_lock(&kvm->srcu);
-
- msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
- if (!msr_filter) {
- allowed = true;
- goto out;
- }
-
- allowed = msr_filter->default_allow;
- ranges = msr_filter->ranges;
-
- for (i = 0; i < msr_filter->count; i++) {
- u32 start = ranges[i].base;
- u32 end = start + ranges[i].nmsrs;
- u32 flags = ranges[i].flags;
- unsigned long *bitmap = ranges[i].bitmap;
-
- if ((index >= start) && (index < end) && (flags & type)) {
- allowed = test_bit(index - start, bitmap);
- break;
- }
- }
-
-out:
- srcu_read_unlock(&kvm->srcu, idx);
-
- return allowed;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed);
-
-/*
- * Write @data into the MSR specified by @index. Select MSR specific fault
- * checks are bypassed if @host_initiated is %true.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
- bool host_initiated)
-{
- struct msr_data msr;
-
- switch (index) {
- case MSR_FS_BASE:
- case MSR_GS_BASE:
- case MSR_KERNEL_GS_BASE:
- case MSR_CSTAR:
- case MSR_LSTAR:
- if (is_noncanonical_msr_address(data, vcpu))
- return 1;
- break;
- case MSR_IA32_SYSENTER_EIP:
- case MSR_IA32_SYSENTER_ESP:
- /*
- * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
- * non-canonical address is written on Intel but not on
- * AMD (which ignores the top 32-bits, because it does
- * not implement 64-bit SYSENTER).
- *
- * 64-bit code should hence be able to write a non-canonical
- * value on AMD. Making the address canonical ensures that
- * vmentry does not fail on Intel after writing a non-canonical
- * value, and that something deterministic happens if the guest
- * invokes 64-bit SYSENTER.
- */
- data = __canonical_address(data, max_host_virt_addr_bits());
- break;
- case MSR_TSC_AUX:
- if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
- return 1;
-
- if (!host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
- return 1;
-
- /*
- * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
- * incomplete and conflicting architectural behavior. Current
- * AMD CPUs completely ignore bits 63:32, i.e. they aren't
- * reserved and always read as zeros. Enforce Intel's reserved
- * bits check if the guest CPU is Intel compatible, otherwise
- * clear the bits. This ensures cross-vendor migration will
- * provide consistent behavior for the guest.
- */
- if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
- return 1;
-
- data = (u32)data;
- break;
- case MSR_IA32_U_CET:
- case MSR_IA32_S_CET:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
- return KVM_MSR_RET_UNSUPPORTED;
- if (!kvm_is_valid_u_s_cet(vcpu, data))
- return 1;
- break;
- case MSR_KVM_INTERNAL_GUEST_SSP:
- if (!host_initiated)
- return 1;
- fallthrough;
- /*
- * Note that the MSR emulation here is flawed when a vCPU
- * doesn't support the Intel 64 architecture. The expected
- * architectural behavior in this case is that the upper 32
- * bits do not exist and should always read '0'. However,
- * because the actual hardware on which the virtual CPU is
- * running does support Intel 64, XRSTORS/XSAVES in the
- * guest could observe behavior that violates the
- * architecture. Intercepting XRSTORS/XSAVES for this
- * special case isn't deemed worthwhile.
- */
- case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
- return KVM_MSR_RET_UNSUPPORTED;
- /*
- * MSR_IA32_INT_SSP_TAB is not present on processors that do
- * not support Intel 64 architecture.
- */
- if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
- return KVM_MSR_RET_UNSUPPORTED;
- if (is_noncanonical_msr_address(data, vcpu))
- return 1;
- /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
- if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4))
- return 1;
- break;
- }
-
- msr.data = data;
- msr.index = index;
- msr.host_initiated = host_initiated;
-
- return kvm_x86_call(set_msr)(vcpu, &msr);
-}
-
-static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
-{
- return __kvm_set_msr(vcpu, index, *data, host_initiated);
-}
-
-static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
- u32 index, u64 data, bool host_initiated)
-{
- return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
- _kvm_set_msr);
-}
-
-/*
- * Read the MSR specified by @index into @data. Select MSR specific fault
- * checks are bypassed if @host_initiated is %true.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
-{
- struct msr_data msr;
- int ret;
-
- switch (index) {
- case MSR_TSC_AUX:
- if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
- return 1;
-
- if (!host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
- return 1;
- break;
- case MSR_IA32_U_CET:
- case MSR_IA32_S_CET:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
- return KVM_MSR_RET_UNSUPPORTED;
- break;
- case MSR_KVM_INTERNAL_GUEST_SSP:
- if (!host_initiated)
- return 1;
- fallthrough;
- case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
- return KVM_MSR_RET_UNSUPPORTED;
- break;
- }
-
- msr.index = index;
- msr.host_initiated = host_initiated;
-
- ret = kvm_x86_call(get_msr)(vcpu, &msr);
- if (!ret)
- *data = msr.data;
- return ret;
-}
-
-int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
-{
- return __kvm_set_msr(vcpu, index, data, true);
-}
-
-int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- return __kvm_get_msr(vcpu, index, data, true);
-}
-
-static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
- u32 index, u64 *data, bool host_initiated)
-{
- return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
- __kvm_get_msr);
-}
-
-int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- return kvm_get_msr_ignored_check(vcpu, index, data, false);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read);
-
-int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
-{
- return kvm_set_msr_ignored_check(vcpu, index, data, false);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write);
-
-int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return KVM_MSR_RET_FILTERED;
-
- return __kvm_emulate_msr_read(vcpu, index, data);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read);
-
-int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
-{
- if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return KVM_MSR_RET_FILTERED;
-
- return __kvm_emulate_msr_write(vcpu, index, data);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write);
-
-
-static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->run->msr.error) {
- kvm_eax_write(vcpu, vcpu->run->msr.data);
- kvm_edx_write(vcpu, vcpu->run->msr.data >> 32);
- }
-}
-
-static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
-{
- return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
-}
-
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
-{
- complete_userspace_rdmsr(vcpu);
- return complete_emulated_msr_access(vcpu);
-}
-
-static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
-}
-
-static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
-{
- complete_userspace_rdmsr(vcpu);
- return complete_fast_msr_access(vcpu);
-}
-
-static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->run->msr.error)
- kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
- vcpu->run->msr.data);
-
- return complete_fast_msr_access(vcpu);
-}
-
-static u64 kvm_msr_reason(int r)
-{
- switch (r) {
- case KVM_MSR_RET_UNSUPPORTED:
- return KVM_MSR_EXIT_REASON_UNKNOWN;
- case KVM_MSR_RET_FILTERED:
- return KVM_MSR_EXIT_REASON_FILTER;
- default:
- return KVM_MSR_EXIT_REASON_INVAL;
- }
-}
-
-static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
- u32 exit_reason, u64 data,
- int (*completion)(struct kvm_vcpu *vcpu),
- int r)
-{
- u64 msr_reason = kvm_msr_reason(r);
-
- /* Check if the user wanted to know about this MSR fault */
- if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
- return 0;
-
- vcpu->run->exit_reason = exit_reason;
- vcpu->run->msr.error = 0;
- memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
- vcpu->run->msr.reason = msr_reason;
- vcpu->run->msr.index = index;
- vcpu->run->msr.data = data;
- vcpu->arch.complete_userspace_io = completion;
-
- return 1;
-}
-
-static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg,
- int (*complete_rdmsr)(struct kvm_vcpu *))
-{
- u64 data;
- int r;
-
- r = kvm_emulate_msr_read(vcpu, msr, &data);
-
- if (!r) {
- trace_kvm_msr_read(msr, data);
-
- if (reg < 0) {
- kvm_eax_write(vcpu, data);
- kvm_edx_write(vcpu, data >> 32);
- } else {
- kvm_register_write(vcpu, reg, data);
- }
- } else {
- /* MSR read failed? See if we should ask user space */
- if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0,
- complete_rdmsr, r))
- return 0;
- trace_kvm_msr_read_ex(msr);
- }
-
- return kvm_x86_call(complete_emulated_msr)(vcpu, r);
-}
-
-int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
-{
- return __kvm_emulate_rdmsr(vcpu, kvm_ecx_read(vcpu), -1,
- complete_fast_rdmsr);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr);
-
-int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
-{
- vcpu->arch.cui_rdmsr_imm_reg = reg;
-
- return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm);
-
-static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- int r;
-
- r = kvm_emulate_msr_write(vcpu, msr, data);
- if (!r) {
- trace_kvm_msr_write(msr, data);
- } else {
- /* MSR write failed? See if we should ask user space */
- if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data,
- complete_fast_msr_access, r))
- return 0;
- /* Signal all other negative errors to userspace */
- if (r < 0)
- return r;
- trace_kvm_msr_write_ex(msr, data);
- }
-
- return kvm_x86_call(complete_emulated_msr)(vcpu, r);
-}
-
-int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
-{
- return __kvm_emulate_wrmsr(vcpu, kvm_ecx_read(vcpu),
- kvm_read_edx_eax(vcpu));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr);
-
-int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
-{
- return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm);
-
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
return kvm_skip_emulated_instruction(vcpu);
@@ -2252,72 +856,6 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending();
}
-static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu))
- return EXIT_FASTPATH_NONE;
-
- switch (msr) {
- case APIC_BASE_MSR + (APIC_ICR >> 4):
- if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
- kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
- return EXIT_FASTPATH_NONE;
- break;
- case MSR_IA32_TSC_DEADLINE:
- kvm_set_lapic_tscdeadline_msr(vcpu, data);
- break;
- default:
- return EXIT_FASTPATH_NONE;
- }
-
- trace_kvm_msr_write(msr, data);
-
- if (!kvm_skip_emulated_instruction(vcpu))
- return EXIT_FASTPATH_EXIT_USERSPACE;
-
- return EXIT_FASTPATH_REENTER_GUEST;
-}
-
-fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu)
-{
- return __handle_fastpath_wrmsr(vcpu, kvm_ecx_read(vcpu),
- kvm_read_edx_eax(vcpu));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr);
-
-fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
-{
- return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm);
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- return kvm_get_msr_ignored_check(vcpu, index, data, true);
-}
-
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
- u64 val;
-
- /*
- * Reject writes to immutable feature MSRs if the vCPU model is frozen,
- * as KVM doesn't support modifying the guest vCPU model on the fly,
- * e.g. changing the VMX capabilities MSRs while L2 is active is
- * nonsensical. Allow writes of the same value, e.g. so that userspace
- * can blindly stuff all MSRs when emulating RESET.
- */
- if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) &&
- kvm_is_immutable_feature_msr(index) &&
- (do_get_msr(vcpu, index, &val) || *data != val))
- return -EINVAL;
-
- return kvm_set_msr_ignored_check(vcpu, index, *data, true);
-}
-
#ifdef CONFIG_X86_64
struct pvclock_clock {
int vclock_mode;
@@ -2384,72 +922,6 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
-{
- int version;
- int r;
- struct pvclock_wall_clock wc;
- u32 wc_sec_hi;
- u64 wall_nsec;
-
- if (!wall_clock)
- return;
-
- r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
- if (r)
- return;
-
- if (version & 1)
- ++version; /* first time write, random junk */
-
- ++version;
-
- if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
- return;
-
- wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
- wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
- wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
- wc.version = version;
-
- kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
-
- if (sec_hi_ofs) {
- wc_sec_hi = wall_nsec >> 32;
- kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
- &wc_sec_hi, sizeof(wc_sec_hi));
- }
-
- version++;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
-}
-
-static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
- bool old_msr, bool host_initiated)
-{
- struct kvm_arch *ka = &vcpu->kvm->arch;
-
- if (vcpu->vcpu_id == 0 && !host_initiated) {
- if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-
- ka->boot_vcpu_runs_old_kvmclock = old_msr;
- }
-
- vcpu->arch.time = system_time;
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
-
- /* we verify if the enable bit is set... */
- if (system_time & 1)
- kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info));
- else
- kvm_gpc_deactivate(&vcpu->arch.pv_time);
-
- return;
-}
-
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
do_shl32_div32(dividend, divisor);
@@ -2642,7 +1114,7 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio)
return _tsc;
}
-static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
@@ -2683,7 +1155,7 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier);
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
{
if (vcpu->arch.guest_tsc_protected)
return;
@@ -2797,7 +1269,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
kvm_track_tsc_matching(vcpu, !matched);
}
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
+void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
{
u64 data = user_value ? *user_value : 0;
struct kvm *kvm = vcpu->kvm;
@@ -2865,22 +1337,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
-static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
- s64 adjustment)
-{
- u64 tsc_offset = vcpu->arch.l1_tsc_offset;
- kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
-}
-
-static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
-{
- if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
- WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc((u64) adjustment,
- vcpu->arch.l1_tsc_scaling_ratio);
- adjust_tsc_offset_guest(vcpu, adjustment);
-}
-
#ifdef CONFIG_X86_64
static u64 read_tsc(void)
@@ -3510,151 +1966,6 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
}
}
-/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
-static bool is_mci_control_msr(u32 msr)
-{
- return (msr & 3) == 0;
-}
-static bool is_mci_status_msr(u32 msr)
-{
- return (msr & 3) == 1;
-}
-
-/*
- * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
- */
-static bool can_set_mci_status(struct kvm_vcpu *vcpu)
-{
- /* McStatusWrEn enabled? */
- if (guest_cpuid_is_amd_compatible(vcpu))
- return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
-
- return false;
-}
-
-static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- u64 mcg_cap = vcpu->arch.mcg_cap;
- unsigned bank_num = mcg_cap & 0xff;
- u32 msr = msr_info->index;
- u64 data = msr_info->data;
- u32 offset, last_msr;
-
- switch (msr) {
- case MSR_IA32_MCG_STATUS:
- vcpu->arch.mcg_status = data;
- break;
- case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P) &&
- (data || !msr_info->host_initiated))
- return 1;
- if (data != 0 && data != ~(u64)0)
- return 1;
- vcpu->arch.mcg_ctl = data;
- break;
- case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
- last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
- if (msr > last_msr)
- return 1;
-
- if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
- return 1;
- /* An attempt to write a 1 to a reserved bit raises #GP */
- if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
- return 1;
- offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
- last_msr + 1 - MSR_IA32_MC0_CTL2);
- vcpu->arch.mci_ctl2_banks[offset] = data;
- break;
- case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
- if (msr > last_msr)
- return 1;
-
- /*
- * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
- * values are architecturally undefined. But, some Linux
- * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
- * issue on AMD K8s, allow bit 10 to be clear when setting all
- * other bits in order to avoid an uncaught #GP in the guest.
- *
- * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
- * single-bit ECC data errors.
- */
- if (is_mci_control_msr(msr) &&
- data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
- return 1;
-
- /*
- * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
- * AMD-based CPUs allow non-zero values, but if and only if
- * HWCR[McStatusWrEn] is set.
- */
- if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
- data != 0 && !can_set_mci_status(vcpu))
- return 1;
-
- offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
- last_msr + 1 - MSR_IA32_MC0_CTL);
- vcpu->arch.mce_banks[offset] = data;
- break;
- default:
- return 1;
- }
- return 0;
-}
-
-static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
-{
- gpa_t gpa = data & ~0x3f;
-
- /* Bits 4:5 are reserved, Should be zero */
- if (data & 0x30)
- return 1;
-
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
- (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
- return 1;
-
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
- (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
- return 1;
-
- if (!lapic_in_kernel(vcpu))
- return data ? 1 : 0;
-
- if (__kvm_pv_async_pf_enabled(data) &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
- sizeof(u64)))
- return 1;
-
- vcpu->arch.apf.msr_en_val = data;
-
- if (__kvm_pv_async_pf_enabled(data)) {
- kvm_async_pf_wakeup_all(vcpu);
- } else {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
- }
- return 0;
-}
-
-static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
-{
- /* Bits 8-63 are reserved */
- if (data >> 8)
- return 1;
-
- if (!lapic_in_kernel(vcpu))
- return 1;
-
- vcpu->arch.apf.msr_int_val = data;
-
- vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
-
- return 0;
-}
-
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
kvm_gpc_deactivate(&vcpu->arch.pv_time);
@@ -3815,899 +2126,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
-/*
- * Returns true if the MSR in question is managed via XSTATE, i.e. is context
- * switched with the rest of guest FPU state.
- *
- * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
- */
-static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
- if (!vcpu)
- return false;
-
- switch (msr) {
- case MSR_IA32_U_CET:
- return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ||
- guest_cpu_cap_has(vcpu, X86_FEATURE_IBT);
- case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
- return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
- default:
- return false;
- }
-}
-
-/*
- * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
- * MSR that is managed via XSTATE. Note, the caller is responsible for doing
- * the initial FPU load, this helper only ensures that guest state is resident
- * in hardware (the kernel can load its FPU state in IRQ context).
- *
- * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
- * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
- * consumed when transitioning to lower privilege levels, i.e. are effectively
- * only consumed by userspace as well.
- */
-static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
- struct msr_data *msr_info,
- int access)
-{
- BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W);
-
- KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
- KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
-
- kvm_fpu_get();
- if (access == MSR_TYPE_R)
- rdmsrq(msr_info->index, msr_info->data);
- else
- wrmsrq(msr_info->index, msr_info->data);
- kvm_fpu_put();
-}
-
-static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W);
-}
-
-static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R);
-}
-
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- u32 msr = msr_info->index;
- u64 data = msr_info->data;
-
- /*
- * Do not allow host-initiated writes to trigger the Xen hypercall
- * page setup; it could incur locking paths which are not expected
- * if userspace sets the MSR in an unusual location.
- */
- if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
- !msr_info->host_initiated)
- return kvm_xen_write_hypercall_page(vcpu, data);
-
- switch (msr) {
- case MSR_AMD64_NB_CFG:
- case MSR_IA32_UCODE_WRITE:
- case MSR_VM_HSAVE_PA:
- case MSR_AMD64_PATCH_LOADER:
- case MSR_AMD64_BU_CFG2:
- case MSR_AMD64_DC_CFG:
- case MSR_AMD64_TW_CFG:
- case MSR_F15H_EX_CFG:
- break;
-
- case MSR_IA32_UCODE_REV:
- if (msr_info->host_initiated)
- vcpu->arch.microcode_version = data;
- break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated ||
- !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return KVM_MSR_RET_UNSUPPORTED;
- vcpu->arch.arch_capabilities = data;
- break;
- case MSR_IA32_PERF_CAPABILITIES:
- if (!msr_info->host_initiated ||
- !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (data & ~kvm_caps.supported_perf_cap)
- return 1;
-
- /*
- * Note, this is not just a performance optimization! KVM
- * disallows changing feature MSRs after the vCPU has run; PMU
- * refresh will bug the VM if called after the vCPU has run.
- */
- if (vcpu->arch.perf_capabilities == data)
- break;
-
- vcpu->arch.perf_capabilities = data;
- kvm_pmu_refresh(vcpu);
- kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
- break;
- case MSR_IA32_PRED_CMD: {
- u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
-
- if (!msr_info->host_initiated) {
- if ((!guest_has_pred_cmd_msr(vcpu)))
- return 1;
-
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB))
- reserved_bits |= PRED_CMD_IBPB;
-
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB))
- reserved_bits |= PRED_CMD_SBPB;
- }
-
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- reserved_bits |= PRED_CMD_IBPB;
-
- if (!boot_cpu_has(X86_FEATURE_SBPB))
- reserved_bits |= PRED_CMD_SBPB;
-
- if (data & reserved_bits)
- return 1;
-
- if (!data)
- break;
-
- wrmsrq(MSR_IA32_PRED_CMD, data);
- break;
- }
- case MSR_IA32_FLUSH_CMD:
- if (!msr_info->host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D))
- return 1;
-
- if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
- return 1;
- if (!data)
- break;
-
- wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- break;
- case MSR_EFER:
- return set_efer(vcpu, msr_info);
- case MSR_K7_HWCR: {
- /*
- * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
- * through at least v6.6 whine if TscFreqSel is clear,
- * depending on F/M/S.
- */
- u64 valid = BIT_ULL(18) | BIT_ULL(24);
-
- data &= ~(u64)0x40; /* ignore flush filter disable */
- data &= ~(u64)0x100; /* ignore ignne emulation enable */
- data &= ~(u64)0x8; /* ignore TLB cache disable */
-
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_GP_ON_USER_CPUID))
- valid |= MSR_K7_HWCR_CPUID_USER_DIS;
-
- if (data & ~valid) {
- kvm_pr_unimpl_wrmsr(vcpu, msr, data);
- return 1;
- }
- vcpu->arch.msr_hwcr = data;
- break;
- }
- case MSR_FAM10H_MMIO_CONF_BASE:
- if (data != 0) {
- kvm_pr_unimpl_wrmsr(vcpu, msr, data);
- return 1;
- }
- break;
- case MSR_IA32_CR_PAT:
- if (!kvm_pat_valid(data))
- return 1;
-
- vcpu->arch.pat = data;
- break;
- case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- return kvm_mtrr_set_msr(vcpu, msr, data);
- case MSR_IA32_APICBASE:
- return kvm_apic_set_base(vcpu, data, msr_info->host_initiated);
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
- return kvm_x2apic_msr_write(vcpu, msr, data);
- case MSR_IA32_TSC_DEADLINE:
- kvm_set_lapic_tscdeadline_msr(vcpu, data);
- break;
- case MSR_IA32_TSC_ADJUST:
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
- if (!msr_info->host_initiated) {
- s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
- adjust_tsc_offset_guest(vcpu, adj);
- /* Before back to guest, tsc_timestamp must be adjusted
- * as well, otherwise guest's percpu pvclock time could jump.
- */
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
- vcpu->arch.ia32_tsc_adjust_msr = data;
- }
- break;
- case MSR_IA32_MISC_ENABLE: {
- u64 old_val = vcpu->arch.ia32_misc_enable_msr;
-
- if (!msr_info->host_initiated) {
- /* RO bits */
- if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
- return 1;
-
- /* R bits, i.e. writes are ignored, but don't fault. */
- data = data & ~MSR_IA32_MISC_ENABLE_EMON;
- data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
- }
-
- if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
- ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
- return 1;
- vcpu->arch.ia32_misc_enable_msr = data;
- vcpu->arch.cpuid_dynamic_bits_dirty = true;
- } else {
- vcpu->arch.ia32_misc_enable_msr = data;
- }
- break;
- }
- case MSR_IA32_SMBASE:
- if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
- return 1;
- vcpu->arch.smbase = data;
- break;
- case MSR_IA32_POWER_CTL:
- vcpu->arch.msr_ia32_power_ctl = data;
- break;
- case MSR_IA32_TSC:
- if (msr_info->host_initiated) {
- kvm_synchronize_tsc(vcpu, &data);
- } else if (!vcpu->arch.guest_tsc_protected) {
- u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
- adjust_tsc_offset_guest(vcpu, adj);
- vcpu->arch.ia32_tsc_adjust_msr += adj;
- }
- break;
- case MSR_IA32_XSS:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (data & ~vcpu->arch.guest_supported_xss)
- return 1;
- if (vcpu->arch.ia32_xss == data)
- break;
- vcpu->arch.ia32_xss = data;
- vcpu->arch.cpuid_dynamic_bits_dirty = true;
- break;
- case MSR_SMI_COUNT:
- if (!msr_info->host_initiated)
- return 1;
- vcpu->arch.smi_count = data;
- break;
- case MSR_KVM_WALL_CLOCK_NEW:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return KVM_MSR_RET_UNSUPPORTED;
-
- vcpu->kvm->arch.wall_clock = data;
- kvm_write_wall_clock(vcpu->kvm, data, 0);
- break;
- case MSR_KVM_WALL_CLOCK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return KVM_MSR_RET_UNSUPPORTED;
-
- vcpu->kvm->arch.wall_clock = data;
- kvm_write_wall_clock(vcpu->kvm, data, 0);
- break;
- case MSR_KVM_SYSTEM_TIME_NEW:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return KVM_MSR_RET_UNSUPPORTED;
-
- kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
- break;
- case MSR_KVM_SYSTEM_TIME:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return KVM_MSR_RET_UNSUPPORTED;
-
- kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
- break;
- case MSR_KVM_ASYNC_PF_EN:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (kvm_pv_enable_async_pf(vcpu, data))
- return 1;
- break;
- case MSR_KVM_ASYNC_PF_INT:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (kvm_pv_enable_async_pf_int(vcpu, data))
- return 1;
- break;
- case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return KVM_MSR_RET_UNSUPPORTED;
- if (data & 0x1) {
- /*
- * Pairs with the smp_mb__after_atomic() in
- * kvm_arch_async_page_present_queued().
- */
- smp_store_mb(vcpu->arch.apf.pageready_pending, false);
-
- kvm_check_async_pf_completion(vcpu);
- }
- break;
- case MSR_KVM_STEAL_TIME:
- if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (unlikely(!sched_info_on()))
- return 1;
-
- if (data & KVM_STEAL_RESERVED_MASK)
- return 1;
-
- vcpu->arch.st.msr_val = data;
-
- if (!(data & KVM_MSR_ENABLED))
- break;
-
- kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
-
- break;
- case MSR_KVM_PV_EOI_EN:
- if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
- return KVM_MSR_RET_UNSUPPORTED;
-
- if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
- return 1;
- break;
-
- case MSR_KVM_POLL_CONTROL:
- if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
- return KVM_MSR_RET_UNSUPPORTED;
-
- /* only enable bit supported */
- if (data & (-1ULL << 1))
- return 1;
-
- vcpu->arch.msr_kvm_poll_control = data;
- break;
-
- case MSR_IA32_MCG_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
- return set_msr_mce(vcpu, msr_info);
-
- case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
- case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
- case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
- if (kvm_pmu_is_valid_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr_info);
-
- if (data)
- kvm_pr_unimpl_wrmsr(vcpu, msr, data);
- break;
- case MSR_K7_CLK_CTL:
- /*
- * Ignore all writes to this no longer documented MSR.
- * Writes are only relevant for old K7 processors,
- * all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to specify the
- * affected processor models on the command line, hence
- * the need to ignore the workaround.
- */
- break;
-#ifdef CONFIG_KVM_HYPERV
- case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
- case HV_X64_MSR_SYNDBG_OPTIONS:
- case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- case HV_X64_MSR_CRASH_CTL:
- case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
- case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
- case HV_X64_MSR_TSC_EMULATION_CONTROL:
- case HV_X64_MSR_TSC_EMULATION_STATUS:
- case HV_X64_MSR_TSC_INVARIANT_CONTROL:
- return kvm_hv_set_msr_common(vcpu, msr, data,
- msr_info->host_initiated);
-#endif
- case MSR_IA32_BBL_CR_CTL3:
- /* Drop writes to this legacy MSR -- see rdmsr
- * counterpart for further detail.
- */
- kvm_pr_unimpl_wrmsr(vcpu, msr, data);
- break;
- case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
- return 1;
- vcpu->arch.osvw.length = data;
- break;
- case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
- return 1;
- vcpu->arch.osvw.status = data;
- break;
- case MSR_PLATFORM_INFO:
- if (!msr_info->host_initiated)
- return 1;
- vcpu->arch.msr_platform_info = data;
- break;
- case MSR_MISC_FEATURES_ENABLES:
- if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
- (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
- !(vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT)))
- return 1;
- vcpu->arch.msr_misc_features_enables = data;
- break;
-#ifdef CONFIG_X86_64
- case MSR_IA32_XFD:
- if (!msr_info->host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
- return 1;
-
- if (data & ~kvm_guest_supported_xfd(vcpu))
- return 1;
-
- fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
- break;
- case MSR_IA32_XFD_ERR:
- if (!msr_info->host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
- return 1;
-
- if (data & ~kvm_guest_supported_xfd(vcpu))
- return 1;
-
- vcpu->arch.guest_fpu.xfd_err = data;
- break;
-#endif
- case MSR_IA32_U_CET:
- case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
- kvm_set_xstate_msr(vcpu, msr_info);
- break;
- default:
- if (kvm_pmu_is_valid_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr_info);
-
- return KVM_MSR_RET_UNSUPPORTED;
- }
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common);
-
-static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
-{
- u64 data;
- u64 mcg_cap = vcpu->arch.mcg_cap;
- unsigned bank_num = mcg_cap & 0xff;
- u32 offset, last_msr;
-
- switch (msr) {
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- data = 0;
- break;
- case MSR_IA32_MCG_CAP:
- data = vcpu->arch.mcg_cap;
- break;
- case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P) && !host)
- return 1;
- data = vcpu->arch.mcg_ctl;
- break;
- case MSR_IA32_MCG_STATUS:
- data = vcpu->arch.mcg_status;
- break;
- case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
- last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
- if (msr > last_msr)
- return 1;
-
- if (!(mcg_cap & MCG_CMCI_P) && !host)
- return 1;
- offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
- last_msr + 1 - MSR_IA32_MC0_CTL2);
- data = vcpu->arch.mci_ctl2_banks[offset];
- break;
- case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
- if (msr > last_msr)
- return 1;
-
- offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
- last_msr + 1 - MSR_IA32_MC0_CTL);
- data = vcpu->arch.mce_banks[offset];
- break;
- default:
- return 1;
- }
- *pdata = data;
- return 0;
-}
-
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- switch (msr_info->index) {
- case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_LASTBRANCHFROMIP:
- case MSR_IA32_LASTBRANCHTOIP:
- case MSR_IA32_LASTINTFROMIP:
- case MSR_IA32_LASTINTTOIP:
- case MSR_AMD64_SYSCFG:
- case MSR_K8_TSEG_ADDR:
- case MSR_K8_TSEG_MASK:
- case MSR_VM_HSAVE_PA:
- case MSR_K8_INT_PENDING_MSG:
- case MSR_AMD64_NB_CFG:
- case MSR_FAM10H_MMIO_CONF_BASE:
- case MSR_AMD64_BU_CFG2:
- case MSR_IA32_PERF_CTL:
- case MSR_AMD64_DC_CFG:
- case MSR_AMD64_TW_CFG:
- case MSR_F15H_EX_CFG:
- /*
- * Intel Sandy Bridge CPUs must support the RAPL (running average power
- * limit) MSRs. Just return 0, as we do not want to expose the host
- * data here. Do not conditionalize this on CPUID, as KVM does not do
- * so for existing CPU-specific MSRs.
- */
- case MSR_RAPL_POWER_UNIT:
- case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
- case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
- case MSR_PKG_ENERGY_STATUS: /* Total package */
- case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
- msr_info->data = 0;
- break;
- case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
- case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
- if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info);
- msr_info->data = 0;
- break;
- case MSR_IA32_UCODE_REV:
- msr_info->data = vcpu->arch.microcode_version;
- break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return KVM_MSR_RET_UNSUPPORTED;
- msr_info->data = vcpu->arch.arch_capabilities;
- break;
- case MSR_IA32_PERF_CAPABILITIES:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
- return KVM_MSR_RET_UNSUPPORTED;
- msr_info->data = vcpu->arch.perf_capabilities;
- break;
- case MSR_IA32_POWER_CTL:
- msr_info->data = vcpu->arch.msr_ia32_power_ctl;
- break;
- case MSR_IA32_TSC: {
- /*
- * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
- * even when not intercepted. AMD manual doesn't explicitly
- * state this but appears to behave the same.
- *
- * On userspace reads and writes, however, we unconditionally
- * return L1's TSC value to ensure backwards-compatible
- * behavior for migration.
- */
- u64 offset, ratio;
-
- if (msr_info->host_initiated) {
- offset = vcpu->arch.l1_tsc_offset;
- ratio = vcpu->arch.l1_tsc_scaling_ratio;
- } else {
- offset = vcpu->arch.tsc_offset;
- ratio = vcpu->arch.tsc_scaling_ratio;
- }
-
- msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
- break;
- }
- case MSR_IA32_CR_PAT:
- msr_info->data = vcpu->arch.pat;
- break;
- case MSR_MTRRcap:
- case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
- case 0xcd: /* fsb frequency */
- msr_info->data = 3;
- break;
- /*
- * MSR_EBC_FREQUENCY_ID
- * Conservative value valid for even the basic CPU models.
- * Models 0,1: 000 in bits 23:21 indicating a bus speed of
- * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
- * and 266MHz for model 3, or 4. Set Core Clock
- * Frequency to System Bus Frequency Ratio to 1 (bits
- * 31:24) even though these are only valid for CPU
- * models > 2, however guests may end up dividing or
- * multiplying by zero otherwise.
- */
- case MSR_EBC_FREQUENCY_ID:
- msr_info->data = 1 << 24;
- break;
- case MSR_IA32_APICBASE:
- msr_info->data = vcpu->arch.apic_base;
- break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
- return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- case MSR_IA32_TSC_DEADLINE:
- msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
- break;
- case MSR_IA32_TSC_ADJUST:
- msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
- break;
- case MSR_IA32_MISC_ENABLE:
- msr_info->data = vcpu->arch.ia32_misc_enable_msr;
- break;
- case MSR_IA32_SMBASE:
- if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
- return 1;
- msr_info->data = vcpu->arch.smbase;
- break;
- case MSR_SMI_COUNT:
- msr_info->data = vcpu->arch.smi_count;
- break;
- case MSR_IA32_PERF_STATUS:
- /* TSC increment by tick */
- msr_info->data = 1000ULL;
- /* CPU multiplier */
- msr_info->data |= (((uint64_t)4ULL) << 40);
- break;
- case MSR_EFER:
- msr_info->data = vcpu->arch.efer;
- break;
- case MSR_KVM_WALL_CLOCK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->kvm->arch.wall_clock;
- break;
- case MSR_KVM_WALL_CLOCK_NEW:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->kvm->arch.wall_clock;
- break;
- case MSR_KVM_SYSTEM_TIME:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.time;
- break;
- case MSR_KVM_SYSTEM_TIME_NEW:
- if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.time;
- break;
- case MSR_KVM_ASYNC_PF_EN:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.apf.msr_en_val;
- break;
- case MSR_KVM_ASYNC_PF_INT:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.apf.msr_int_val;
- break;
- case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = 0;
- break;
- case MSR_KVM_STEAL_TIME:
- if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.st.msr_val;
- break;
- case MSR_KVM_PV_EOI_EN:
- if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.pv_eoi.msr_val;
- break;
- case MSR_KVM_POLL_CONTROL:
- if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
- return KVM_MSR_RET_UNSUPPORTED;
-
- msr_info->data = vcpu->arch.msr_kvm_poll_control;
- break;
- case MSR_IA32_P5_MC_ADDR:
- case MSR_IA32_P5_MC_TYPE:
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MCG_CTL:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
- msr_info->host_initiated);
- case MSR_IA32_XSS:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- return 1;
- msr_info->data = vcpu->arch.ia32_xss;
- break;
- case MSR_K7_CLK_CTL:
- /*
- * Provide expected ramp-up count for K7. All other
- * are set to zero, indicating minimum divisors for
- * every field.
- *
- * This prevents guest kernels on AMD host with CPU
- * type 6, model 8 and higher from exploding due to
- * the rdmsr failing.
- */
- msr_info->data = 0x20000000;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
- case HV_X64_MSR_SYNDBG_OPTIONS:
- case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- case HV_X64_MSR_CRASH_CTL:
- case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
- case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
- case HV_X64_MSR_TSC_EMULATION_CONTROL:
- case HV_X64_MSR_TSC_EMULATION_STATUS:
- case HV_X64_MSR_TSC_INVARIANT_CONTROL:
- return kvm_hv_get_msr_common(vcpu,
- msr_info->index, &msr_info->data,
- msr_info->host_initiated);
-#endif
- case MSR_IA32_BBL_CR_CTL3:
- /* This legacy MSR exists but isn't fully documented in current
- * silicon. It is however accessed by winxp in very narrow
- * scenarios where it sets bit #19, itself documented as
- * a "reserved" bit. Best effort attempt to source coherent
- * read data here should the balance of the register be
- * interpreted by the guest:
- *
- * L2 cache control register 3: 64GB range, 256KB size,
- * enabled, latency 0x1, configured
- */
- msr_info->data = 0xbe702111;
- break;
- case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
- return 1;
- msr_info->data = vcpu->arch.osvw.length;
- break;
- case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
- return 1;
- msr_info->data = vcpu->arch.osvw.status;
- break;
- case MSR_PLATFORM_INFO:
- if (!msr_info->host_initiated &&
- !vcpu->kvm->arch.guest_can_read_msr_platform_info)
- return 1;
- msr_info->data = vcpu->arch.msr_platform_info;
- break;
- case MSR_MISC_FEATURES_ENABLES:
- msr_info->data = vcpu->arch.msr_misc_features_enables;
- break;
- case MSR_K7_HWCR:
- msr_info->data = vcpu->arch.msr_hwcr;
- break;
-#ifdef CONFIG_X86_64
- case MSR_IA32_XFD:
- if (!msr_info->host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
- return 1;
-
- msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
- break;
- case MSR_IA32_XFD_ERR:
- if (!msr_info->host_initiated &&
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
- return 1;
-
- msr_info->data = vcpu->arch.guest_fpu.xfd_err;
- break;
-#endif
- case MSR_IA32_U_CET:
- case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
- kvm_get_xstate_msr(vcpu, msr_info);
- break;
- default:
- if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info);
-
- return KVM_MSR_RET_UNSUPPORTED;
- }
- return 0;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common);
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
- struct kvm_msr_entry *entries,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data))
-{
- bool fpu_loaded = false;
- int i;
-
- for (i = 0; i < msrs->nmsrs; ++i) {
- /*
- * If userspace is accessing one or more XSTATE-managed MSRs,
- * temporarily load the guest's FPU state so that the guest's
- * MSR value(s) is resident in hardware and thus can be accessed
- * via RDMSR/WRMSR.
- */
- if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) {
- kvm_load_guest_fpu(vcpu);
- fpu_loaded = true;
- }
- if (do_msr(vcpu, entries[i].index, &entries[i].data))
- break;
- }
- if (fpu_loaded)
- kvm_put_guest_fpu(vcpu);
-
- return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
- int (*do_msr)(struct kvm_vcpu *vcpu,
- unsigned index, u64 *data),
- int writeback)
-{
- struct kvm_msrs msrs;
- struct kvm_msr_entry *entries;
- unsigned size;
- int r;
-
- r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
- goto out;
-
- r = -E2BIG;
- if (msrs.nmsrs >= MAX_IO_MSRS)
- goto out;
-
- size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = memdup_user(user_msrs->entries, size);
- if (IS_ERR(entries)) {
- r = PTR_ERR(entries);
- goto out;
- }
-
- r = __msr_io(vcpu, &msrs, entries, do_msr);
-
- if (writeback && copy_to_user(user_msrs->entries, entries, size))
- r = -EFAULT;
-
- kfree(entries);
-out:
- return r;
-}
-
static inline bool kvm_can_mwait_in_guest(void)
{
return boot_cpu_has(X86_FEATURE_MWAIT) &&
@@ -5026,32 +2444,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
long r;
switch (ioctl) {
- case KVM_GET_MSR_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- unsigned n;
-
- r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
- goto out;
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
- if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
- goto out;
- r = -E2BIG;
- if (n < msr_list.nmsrs)
- goto out;
- r = -EFAULT;
- if (copy_to_user(user_msr_list->indices, &msrs_to_save,
- num_msrs_to_save * sizeof(u32)))
- goto out;
- if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
- &emulated_msrs,
- num_emulated_msrs * sizeof(u32)))
- goto out;
- r = 0;
+ case KVM_GET_MSR_INDEX_LIST:
+ r = kvm_get_msr_index_list(argp);
break;
- }
case KVM_GET_SUPPORTED_CPUID:
case KVM_GET_EMULATED_CPUID: {
struct kvm_cpuid2 __user *cpuid_arg = argp;
@@ -5079,30 +2474,11 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
r = 0;
break;
- case KVM_GET_MSR_FEATURE_INDEX_LIST: {
- struct kvm_msr_list __user *user_msr_list = argp;
- struct kvm_msr_list msr_list;
- unsigned int n;
-
- r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
- goto out;
- n = msr_list.nmsrs;
- msr_list.nmsrs = num_msr_based_features;
- if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
- goto out;
- r = -E2BIG;
- if (n < msr_list.nmsrs)
- goto out;
- r = -EFAULT;
- if (copy_to_user(user_msr_list->indices, &msr_based_features,
- num_msr_based_features * sizeof(u32)))
- goto out;
- r = 0;
+ case KVM_GET_MSR_FEATURE_INDEX_LIST:
+ r = kvm_get_feature_msr_index_list(argp);
break;
- }
case KVM_GET_MSRS:
- r = msr_io(NULL, argp, do_get_feature_msr, 1);
+ r = kvm_get_feature_msrs(argp);
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
@@ -5322,6 +2698,18 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
return 0;
}
+static bool kvm_is_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, .interrupt_allowed() returns -EBUSY if interrupts are allowed
+ * based on CPU state, but can't be immediately delivered due to a
+ * pending nested VM-Enter. Treat that case as "allowed", because
+ * the goal here is just to check if interrupts are architecturally
+ * allowed, not to check if they can be injected.
+ */
+ return kvm_x86_call(interrupt_allowed)(vcpu, false);
+}
+
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
{
/*
@@ -5347,7 +2735,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
* or KVM_SET_SREGS. For that to work, we must be at an
* instruction boundary and with no events half-injected.
*/
- return (kvm_arch_interrupt_allowed(vcpu) &&
+ return (kvm_is_interrupt_allowed(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu) &&
!kvm_event_needs_reinjection(vcpu) &&
!kvm_is_exception_pending(vcpu));
@@ -5532,7 +2920,7 @@ static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *v
return &vcpu->arch.exception;
}
-static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu)
+void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu);
@@ -5736,57 +3124,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
-static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- unsigned int i;
-
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- kvm_handle_exception_payload_quirk(vcpu);
-
- memset(dbgregs, 0, sizeof(*dbgregs));
-
- BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
- dbgregs->db[i] = vcpu->arch.db[i];
-
- dbgregs->dr6 = vcpu->arch.dr6;
- dbgregs->dr7 = vcpu->arch.dr7;
- return 0;
-}
-
-static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
-{
- unsigned int i;
-
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- if (dbgregs->flags)
- return -EINVAL;
-
- if (!kvm_dr6_valid(dbgregs->dr6))
- return -EINVAL;
- if (!kvm_dr7_valid(dbgregs->dr7))
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
- vcpu->arch.db[i] = dbgregs->db[i];
-
- kvm_update_dr0123(vcpu);
- vcpu->arch.dr6 = dbgregs->dr6;
- vcpu->arch.dr7 = dbgregs->dr7;
- kvm_update_dr7(vcpu);
-
- return 0;
-}
-
-
static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
u8 *state, unsigned int size)
{
@@ -6058,134 +3395,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
}
}
-struct kvm_x86_reg_id {
- __u32 index;
- __u8 type;
- __u8 rsvd1;
- __u8 rsvd2:4;
- __u8 size:4;
- __u8 x86;
-};
-
-static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu,
- struct kvm_x86_reg_id *reg)
-{
- switch (reg->index) {
- case KVM_REG_GUEST_SSP:
- /*
- * FIXME: If host-initiated accesses are ever exempted from
- * ignore_msrs (in kvm_do_msr_access()), drop this manual check
- * and rely on KVM's standard checks to reject accesses to regs
- * that don't exist.
- */
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
- return -EINVAL;
-
- reg->type = KVM_X86_REG_TYPE_MSR;
- reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
-{
- u64 val;
-
- if (do_get_msr(vcpu, msr, &val))
- return -EINVAL;
-
- if (put_user(val, user_val))
- return -EFAULT;
-
- return 0;
-}
-
-static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
-{
- u64 val;
-
- if (get_user(val, user_val))
- return -EFAULT;
-
- if (do_set_msr(vcpu, msr, &val))
- return -EINVAL;
-
- return 0;
-}
-
-static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl,
- void __user *argp)
-{
- struct kvm_one_reg one_reg;
- struct kvm_x86_reg_id *reg;
- u64 __user *user_val;
- bool load_fpu;
- int r;
-
- if (copy_from_user(&one_reg, argp, sizeof(one_reg)))
- return -EFAULT;
-
- if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86)
- return -EINVAL;
-
- reg = (struct kvm_x86_reg_id *)&one_reg.id;
- if (reg->rsvd1 || reg->rsvd2)
- return -EINVAL;
-
- if (reg->type == KVM_X86_REG_TYPE_KVM) {
- r = kvm_translate_kvm_reg(vcpu, reg);
- if (r)
- return r;
- }
-
- if (reg->type != KVM_X86_REG_TYPE_MSR)
- return -EINVAL;
-
- if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
- return -EINVAL;
-
- guard(srcu)(&vcpu->kvm->srcu);
-
- load_fpu = is_xstate_managed_msr(vcpu, reg->index);
- if (load_fpu)
- kvm_load_guest_fpu(vcpu);
-
- user_val = u64_to_user_ptr(one_reg.addr);
- if (ioctl == KVM_GET_ONE_REG)
- r = kvm_get_one_msr(vcpu, reg->index, user_val);
- else
- r = kvm_set_one_msr(vcpu, reg->index, user_val);
-
- if (load_fpu)
- kvm_put_guest_fpu(vcpu);
- return r;
-}
-
-static int kvm_get_reg_list(struct kvm_vcpu *vcpu,
- struct kvm_reg_list __user *user_list)
-{
- u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0;
- u64 user_nr_regs;
-
- if (get_user(user_nr_regs, &user_list->n))
- return -EFAULT;
-
- if (put_user(nr_regs, &user_list->n))
- return -EFAULT;
-
- if (user_nr_regs < nr_regs)
- return -E2BIG;
-
- if (nr_regs &&
- put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
- return -EFAULT;
-
- return 0;
-}
-
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -6290,18 +3499,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
- case KVM_GET_MSRS: {
- int idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = msr_io(vcpu, argp, do_get_msr, 1);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ case KVM_GET_MSRS:
+ r = kvm_get_msrs(vcpu, argp);
break;
- }
- case KVM_SET_MSRS: {
- int idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = msr_io(vcpu, argp, do_set_msr, 0);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ case KVM_SET_MSRS:
+ r = kvm_set_msrs(vcpu, argp);
break;
- }
case KVM_GET_ONE_REG:
case KVM_SET_ONE_REG:
r = kvm_get_set_one_reg(vcpu, ioctl, argp);
@@ -6623,7 +3826,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -ENOMEM;
if (!u.sregs2)
goto out;
- __get_sregs2(vcpu, u.sregs2);
+ kvm_vcpu_ioctl_x86_get_sregs2(vcpu, u.sregs2);
r = -EFAULT;
if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
goto out;
@@ -6642,7 +3845,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u.sregs2 = NULL;
goto out;
}
- r = __set_sregs2(vcpu, u.sregs2);
+ r = kvm_vcpu_ioctl_x86_set_sregs2(vcpu, u.sregs2);
break;
}
case KVM_HAS_DEVICE_ATTR:
@@ -6994,113 +4197,6 @@ disable_exits_unlock:
return r;
}
-static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
-{
- struct kvm_x86_msr_filter *msr_filter;
-
- msr_filter = kzalloc_obj(*msr_filter, GFP_KERNEL_ACCOUNT);
- if (!msr_filter)
- return NULL;
-
- msr_filter->default_allow = default_allow;
- return msr_filter;
-}
-
-static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
-{
- u32 i;
-
- if (!msr_filter)
- return;
-
- for (i = 0; i < msr_filter->count; i++)
- kfree(msr_filter->ranges[i].bitmap);
-
- kfree(msr_filter);
-}
-
-static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
- struct kvm_msr_filter_range *user_range)
-{
- unsigned long *bitmap;
- size_t bitmap_size;
-
- if (!user_range->nmsrs)
- return 0;
-
- if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
- return -EINVAL;
-
- if (!user_range->flags)
- return -EINVAL;
-
- bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
- if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
- return -EINVAL;
-
- bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
- if (IS_ERR(bitmap))
- return PTR_ERR(bitmap);
-
- msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
- .flags = user_range->flags,
- .base = user_range->base,
- .nmsrs = user_range->nmsrs,
- .bitmap = bitmap,
- };
-
- msr_filter->count++;
- return 0;
-}
-
-static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
- struct kvm_msr_filter *filter)
-{
- struct kvm_x86_msr_filter *new_filter, *old_filter;
- bool default_allow;
- bool empty = true;
- int r;
- u32 i;
-
- if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
- empty &= !filter->ranges[i].nmsrs;
-
- default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
- if (empty && !default_allow)
- return -EINVAL;
-
- new_filter = kvm_alloc_msr_filter(default_allow);
- if (!new_filter)
- return -ENOMEM;
-
- for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
- r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
- if (r) {
- kvm_free_msr_filter(new_filter);
- return r;
- }
- }
-
- mutex_lock(&kvm->lock);
- old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
- mutex_is_locked(&kvm->lock));
- mutex_unlock(&kvm->lock);
- synchronize_srcu(&kvm->srcu);
-
- kvm_free_msr_filter(old_filter);
-
- /*
- * Recalc MSR intercepts as userspace may want to intercept accesses to
- * MSRs that KVM would otherwise pass through to the guest.
- */
- kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS);
-
- return 0;
-}
-
#ifdef CONFIG_KVM_COMPAT
/* for KVM_X86_SET_MSR_FILTER */
struct kvm_msr_filter_range_compat {
@@ -7621,157 +4717,6 @@ out:
return r;
}
-static void kvm_probe_feature_msr(u32 msr_index)
-{
- u64 data;
-
- if (kvm_get_feature_msr(NULL, msr_index, &data, true))
- return;
-
- msr_based_features[num_msr_based_features++] = msr_index;
-}
-
-static void kvm_probe_msr_to_save(u32 msr_index)
-{
- u32 dummy[2];
-
- if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
- return;
-
- /*
- * Even MSRs that are valid in the host may not be exposed to guests in
- * some cases.
- */
- switch (msr_index) {
- case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
- return;
- break;
- case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
- !kvm_cpu_cap_has(X86_FEATURE_RDPID))
- return;
- break;
- case MSR_IA32_UMWAIT_CONTROL:
- if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
- return;
- break;
- case MSR_IA32_RTIT_CTL:
- case MSR_IA32_RTIT_STATUS:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
- return;
- break;
- case MSR_IA32_RTIT_CR3_MATCH:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
- return;
- break;
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
- !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
- return;
- break;
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- (msr_index - MSR_IA32_RTIT_ADDR0_A >=
- intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
- return;
- break;
- case MSR_ARCH_PERFMON_PERFCTR0 ...
- MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
- if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
- kvm_pmu_cap.num_counters_gp)
- return;
- break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ...
- MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
- if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
- kvm_pmu_cap.num_counters_gp)
- return;
- break;
- case MSR_ARCH_PERFMON_FIXED_CTR0 ...
- MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
- if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
- kvm_pmu_cap.num_counters_fixed)
- return;
- break;
- case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
- case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
- case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
- case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
- if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
- return;
- break;
- case MSR_IA32_XFD:
- case MSR_IA32_XFD_ERR:
- if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
- return;
- break;
- case MSR_IA32_TSX_CTRL:
- if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
- return;
- break;
- case MSR_IA32_XSS:
- if (!kvm_caps.supported_xss)
- return;
- break;
- case MSR_IA32_U_CET:
- case MSR_IA32_S_CET:
- if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
- !kvm_cpu_cap_has(X86_FEATURE_IBT))
- return;
- break;
- case MSR_IA32_INT_SSP_TAB:
- if (!kvm_cpu_cap_has(X86_FEATURE_LM))
- return;
- fallthrough;
- case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
- if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
- return;
- break;
- default:
- break;
- }
-
- msrs_to_save[num_msrs_to_save++] = msr_index;
-}
-
-static void kvm_init_msr_lists(void)
-{
- unsigned i;
-
- BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
- "Please update the fixed PMCs in msrs_to_save_pmu[]");
-
- num_msrs_to_save = 0;
- num_emulated_msrs = 0;
- num_msr_based_features = 0;
-
- for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
- kvm_probe_msr_to_save(msrs_to_save_base[i]);
-
- if (enable_pmu) {
- for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
- kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
- }
-
- for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_call(has_emulated_msr)(NULL,
- emulated_msrs_all[i]))
- continue;
-
- emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
- }
-
- for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
- kvm_probe_feature_msr(i);
-
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
- kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
-}
-
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
void *__v)
{
@@ -7821,36 +4766,24 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return handled;
}
-void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_call(set_segment)(vcpu, var, seg);
-}
-
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_call(get_segment)(vcpu, var, seg);
-}
-
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+ return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+ return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
@@ -7858,21 +4791,21 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
- return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
+ return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
+ gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -7900,14 +4833,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
/* Inline kvm_read_guest_virt_helper for speed. */
- gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
- exception);
+ gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access|PFERR_FETCH_MASK,
+ exception);
if (unlikely(gpa == INVALID_GPA))
return X86EMUL_PROPAGATE_FAULT;
@@ -7959,12 +4892,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
+ gpa_t gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -8065,7 +4998,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
@@ -8075,7 +5008,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
* shadow page table for L2 guest.
*/
if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
- !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ !permission_fault(vcpu, gva_walk,
vcpu->arch.mmio_access, 0, access))) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
@@ -8083,7 +5016,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
return 1;
}
- *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
+ *gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
if (*gpa == INVALID_GPA)
return -1;
@@ -8492,11 +5425,6 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
}
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- return kvm_x86_call(get_segment_base)(vcpu, seg);
-}
-
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
{
kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
@@ -8641,7 +5569,7 @@ static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
static unsigned long emulator_get_cached_segment_base(
struct x86_emulate_ctxt *ctxt, int seg)
{
- return get_segment_base(emul_to_vcpu(ctxt), seg);
+ return kvm_get_segment_base(emul_to_vcpu(ctxt), seg);
}
static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
@@ -8714,61 +5642,22 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int r;
-
- r = kvm_emulate_msr_read(vcpu, msr_index, pdata);
- if (r < 0)
- return X86EMUL_UNHANDLEABLE;
- if (r) {
- if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r))
- return X86EMUL_IO_NEEDED;
-
- trace_kvm_msr_read_ex(msr_index);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- trace_kvm_msr_read(msr_index, *pdata);
- return X86EMUL_CONTINUE;
+ return kvm_emulator_get_msr_with_filter(vcpu, msr_index, pdata);
}
static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int r;
-
- r = kvm_emulate_msr_write(vcpu, msr_index, data);
- if (r < 0)
- return X86EMUL_UNHANDLEABLE;
- if (r) {
- if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_msr_access, r))
- return X86EMUL_IO_NEEDED;
-
- trace_kvm_msr_write_ex(msr_index, data);
- return X86EMUL_PROPAGATE_FAULT;
- }
-
- trace_kvm_msr_write(msr_index, data);
- return X86EMUL_CONTINUE;
+ return kvm_emulator_set_msr_with_filter(vcpu, msr_index, data);
}
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- /*
- * Treat emulator accesses to the current shadow stack pointer as host-
- * initiated, as they aren't true MSR accesses (SSP is a "just a reg"),
- * and this API is used only for implicit accesses, i.e. not RDMSR, and
- * so the index is fully KVM-controlled.
- */
- if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP))
- return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
-
- return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
+ return kvm_emulator_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
@@ -11596,7 +8485,7 @@ bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
return true;
- if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
+ if (kvm_is_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
return true;
if (kvm_hv_has_stimer_pending(vcpu))
@@ -11902,28 +8791,6 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
-/* Swap (qemu) user FPU context for the guest FPU context. */
-static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
- return;
-
- /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
- fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
- trace_kvm_fpu(1);
-}
-
-/* When vcpu_run ends, restore user space FPU context. */
-static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
- return;
-
- fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
- ++vcpu->stat.fpu_reload;
- trace_kvm_fpu(0);
-}
-
static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
/*
@@ -12073,179 +8940,6 @@ out:
return r;
}
-static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
- /*
- * We are here if userspace calls get_regs() in the middle of
- * instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Userspace shouldn't do
- * that usually, but some bad designed PV devices (vmware
- * backdoor interface) need this to work
- */
- emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
- vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- }
- regs->rax = kvm_rax_read_raw(vcpu);
- regs->rbx = kvm_rbx_read_raw(vcpu);
- regs->rcx = kvm_rcx_read_raw(vcpu);
- regs->rdx = kvm_rdx_read_raw(vcpu);
- regs->rsi = kvm_rsi_read_raw(vcpu);
- regs->rdi = kvm_rdi_read_raw(vcpu);
- regs->rsp = kvm_rsp_read(vcpu);
- regs->rbp = kvm_rbp_read_raw(vcpu);
-#ifdef CONFIG_X86_64
- regs->r8 = kvm_r8_read_raw(vcpu);
- regs->r9 = kvm_r9_read_raw(vcpu);
- regs->r10 = kvm_r10_read_raw(vcpu);
- regs->r11 = kvm_r11_read_raw(vcpu);
- regs->r12 = kvm_r12_read_raw(vcpu);
- regs->r13 = kvm_r13_read_raw(vcpu);
- regs->r14 = kvm_r14_read_raw(vcpu);
- regs->r15 = kvm_r15_read_raw(vcpu);
-#endif
-
- regs->rip = kvm_rip_read(vcpu);
- regs->rflags = kvm_get_rflags(vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- vcpu_load(vcpu);
- __get_regs(vcpu, regs);
- vcpu_put(vcpu);
- return 0;
-}
-
-static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
- vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-
- kvm_rax_write_raw(vcpu, regs->rax);
- kvm_rbx_write_raw(vcpu, regs->rbx);
- kvm_rcx_write_raw(vcpu, regs->rcx);
- kvm_rdx_write_raw(vcpu, regs->rdx);
- kvm_rsi_write_raw(vcpu, regs->rsi);
- kvm_rdi_write_raw(vcpu, regs->rdi);
- kvm_rsp_write(vcpu, regs->rsp);
- kvm_rbp_write_raw(vcpu, regs->rbp);
-#ifdef CONFIG_X86_64
- kvm_r8_write_raw(vcpu, regs->r8);
- kvm_r9_write_raw(vcpu, regs->r9);
- kvm_r10_write_raw(vcpu, regs->r10);
- kvm_r11_write_raw(vcpu, regs->r11);
- kvm_r12_write_raw(vcpu, regs->r12);
- kvm_r13_write_raw(vcpu, regs->r13);
- kvm_r14_write_raw(vcpu, regs->r14);
- kvm_r15_write_raw(vcpu, regs->r15);
-#endif
-
- kvm_rip_write(vcpu, regs->rip);
- kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
-
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception_vmexit.pending = false;
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
-{
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- vcpu_load(vcpu);
- __set_regs(vcpu, regs);
- vcpu_put(vcpu);
- return 0;
-}
-
-static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
-{
- struct desc_ptr dt;
-
- if (vcpu->arch.guest_state_protected)
- goto skip_protected_regs;
-
- kvm_handle_exception_payload_quirk(vcpu);
-
- kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- kvm_x86_call(get_idt)(vcpu, &dt);
- sregs->idt.limit = dt.size;
- sregs->idt.base = dt.address;
- kvm_x86_call(get_gdt)(vcpu, &dt);
- sregs->gdt.limit = dt.size;
- sregs->gdt.base = dt.address;
-
- sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = kvm_read_cr3(vcpu);
-
-skip_protected_regs:
- sregs->cr0 = kvm_read_cr0(vcpu);
- sregs->cr4 = kvm_read_cr4(vcpu);
- sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.efer;
- sregs->apic_base = vcpu->arch.apic_base;
-}
-
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
-{
- __get_sregs_common(vcpu, sregs);
-
- if (vcpu->arch.guest_state_protected)
- return;
-
- if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
- set_bit(vcpu->arch.interrupt.nr,
- (unsigned long *)sregs->interrupt_bitmap);
-}
-
-static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
-{
- int i;
-
- __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
-
- if (vcpu->arch.guest_state_protected)
- return;
-
- if (is_pae_paging(vcpu)) {
- kvm_vcpu_srcu_read_lock(vcpu);
- for (i = 0 ; i < 4 ; i++)
- sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
- sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
- kvm_vcpu_srcu_read_unlock(vcpu);
- }
-}
-
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- vcpu_load(vcpu);
- __get_sregs(vcpu, sregs);
- vcpu_put(vcpu);
- return 0;
-}
-
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
@@ -12365,173 +9059,6 @@ unhandled_task_switch:
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch);
-static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
-{
- if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
- /*
- * When EFER.LME and CR0.PG are set, the processor is in
- * 64-bit mode (though maybe in a 32-bit code segment).
- * CR4.PAE and EFER.LMA must be set.
- */
- if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
- return false;
- if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
- return false;
- } else {
- /*
- * Not in 64-bit mode: EFER.LMA is clear and the code
- * segment cannot be 64-bit.
- */
- if (sregs->efer & EFER_LMA || sregs->cs.l)
- return false;
- }
-
- return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
- kvm_is_valid_cr0(vcpu, sregs->cr0);
-}
-
-static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
- int *mmu_reset_needed, bool update_pdptrs)
-{
- int idx;
- struct desc_ptr dt;
-
- if (!kvm_is_valid_sregs(vcpu, sregs))
- return -EINVAL;
-
- if (kvm_apic_set_base(vcpu, sregs->apic_base, true))
- return -EINVAL;
-
- if (vcpu->arch.guest_state_protected)
- return 0;
-
- dt.size = sregs->idt.limit;
- dt.address = sregs->idt.base;
- kvm_x86_call(set_idt)(vcpu, &dt);
- dt.size = sregs->gdt.limit;
- dt.address = sregs->gdt.base;
- kvm_x86_call(set_gdt)(vcpu, &dt);
-
- vcpu->arch.cr2 = sregs->cr2;
- *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
- vcpu->arch.cr3 = sregs->cr3;
- kvm_register_mark_dirty(vcpu, VCPU_REG_CR3);
- kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
-
- *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_call(set_efer)(vcpu, sregs->efer);
-
- *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
-
- *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
-
- if (update_pdptrs) {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, kvm_read_cr3(vcpu));
- *mmu_reset_needed = 1;
- }
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- }
-
- kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
- kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
- kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
- kvm_set_cr8(vcpu, sregs->cr8);
-
- /* Older userspace won't unhalt the vcpu on reset. */
- if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
- sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !is_protmode(vcpu))
- kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
-
- return 0;
-}
-
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
-{
- int pending_vec, max_bits;
- int mmu_reset_needed = 0;
- int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
-
- if (ret)
- return ret;
-
- if (mmu_reset_needed) {
- kvm_mmu_reset_context(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
- }
-
- max_bits = KVM_NR_INTERRUPTS;
- pending_vec = find_first_bit(
- (const unsigned long *)sregs->interrupt_bitmap, max_bits);
-
- if (pending_vec < max_bits) {
- kvm_queue_interrupt(vcpu, pending_vec, false);
- pr_debug("Set back pending irq %d\n", pending_vec);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
- return 0;
-}
-
-static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
-{
- int mmu_reset_needed = 0;
- bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
- bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
- !(sregs2->efer & EFER_LMA);
- int i, ret;
-
- if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
- return -EINVAL;
-
- if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
- return -EINVAL;
-
- ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
- &mmu_reset_needed, !valid_pdptrs);
- if (ret)
- return ret;
-
- if (valid_pdptrs) {
- for (i = 0; i < 4 ; i++)
- kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
-
- kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
- mmu_reset_needed = 1;
- vcpu->arch.pdptrs_from_userspace = true;
- }
- if (mmu_reset_needed) {
- kvm_mmu_reset_context(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
- }
- return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
- struct kvm_sregs *sregs)
-{
- int ret;
-
- if (vcpu->kvm->arch.has_protected_state &&
- vcpu->arch.guest_state_protected)
- return -EINVAL;
-
- vcpu_load(vcpu);
- ret = __set_sregs(vcpu, sregs);
- vcpu_put(vcpu);
- return ret;
-}
-
static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
{
bool set = false;
@@ -12687,13 +9214,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
static void store_regs(struct kvm_vcpu *vcpu)
{
- BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
-
- if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
- __get_regs(vcpu, &vcpu->run->s.regs.regs);
-
- if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
- __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+ kvm_run_sync_regs_to_user(vcpu);
if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
kvm_vcpu_ioctl_x86_get_vcpu_events(
@@ -12702,19 +9223,8 @@ static void store_regs(struct kvm_vcpu *vcpu)
static int sync_regs(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
- __set_regs(vcpu, &vcpu->run->s.regs.regs);
- vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
- }
-
- if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
- struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
-
- if (__set_sregs(vcpu, &sregs))
- return -EINVAL;
-
- vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
- }
+ if (kvm_run_sync_regs_from_user(vcpu))
+ return -EINVAL;
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
struct kvm_vcpu_events events = vcpu->run->s.regs.events;
@@ -13457,13 +9967,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (kvm->arch.created_mediated_pmu)
perf_release_mediated_pmu();
kvm_destroy_vcpus(kvm);
- kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+ kvm_free_msr_filter((void * __force)kvm->arch.msr_filter);
#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
- kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
+ kfree((void * __force)kvm->arch.pmu_event_filter);
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
@@ -13811,56 +10321,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_call(interrupt_allowed)(vcpu, false);
-}
-
-unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
-{
- /* Can't read the RIP when guest state is protected, just return 0 */
- if (vcpu->arch.guest_state_protected)
- return 0;
-
- if (is_64_bit_mode(vcpu))
- return kvm_rip_read(vcpu);
- return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
- kvm_rip_read(vcpu));
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip);
-
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
-{
- return kvm_get_linear_rip(vcpu) == linear_rip;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip);
-
-unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
-{
- unsigned long rflags;
-
- rflags = kvm_x86_call(get_rflags)(vcpu);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~X86_EFLAGS_TF;
- return rflags;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags);
-
-static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
- rflags |= X86_EFLAGS_TF;
- kvm_x86_call(set_rflags)(vcpu, rflags);
-}
-
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- __kvm_set_rflags(vcpu, rflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags);
-
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
@@ -13996,7 +10456,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
* If interrupts are off we cannot even use an artificial
* halt state.
*/
- return kvm_arch_interrupt_allowed(vcpu);
+ return kvm_is_interrupt_allowed(vcpu);
}
bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
@@ -14139,43 +10599,17 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
#endif
#endif
-int kvm_spec_ctrl_test_value(u64 value)
-{
- /*
- * test that setting IA32_SPEC_CTRL to given value
- * is allowed by the host processor
- */
-
- u64 saved_value;
- unsigned long flags;
- int ret = 0;
-
- local_irq_save(flags);
-
- if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value))
- ret = 1;
- else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value))
- ret = 1;
- else
- wrmsrq(MSR_IA32_SPEC_CTRL, saved_value);
-
- local_irq_restore(flags);
-
- return ret;
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value);
-
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
struct x86_exception fault;
u64 access = error_code &
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
+ gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, &fault) != INVALID_GPA) {
/*
- * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * If gva_walk->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
* with the error code that the processor gave.
*/
@@ -14186,7 +10620,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
fault.address = gva;
fault.async_page_fault = false;
}
- vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault, true);
+ gva_walk->inject_page_fault(vcpu, &fault, true);
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de577ef9c97..8ece468087a8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -6,53 +6,16 @@
#include <asm/fpu/xstate.h>
#include <asm/mce.h>
#include <asm/pvclock.h>
+#include "msrs.h"
+#include "mmu.h"
#include "regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#define KVM_MAX_MCE_BANKS 32
-struct kvm_caps {
- /* control of guest tsc rate supported? */
- bool has_tsc_control;
- /* maximum supported tsc_khz for guests */
- u32 max_guest_tsc_khz;
- /* number of bits of the fractional part of the TSC scaling ratio */
- u8 tsc_scaling_ratio_frac_bits;
- /* maximum allowed value of TSC scaling ratio */
- u64 max_tsc_scaling_ratio;
- /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */
- u64 default_tsc_scaling_ratio;
- /* bus lock detection supported? */
- bool has_bus_lock_exit;
- /* notify VM exit supported? */
- bool has_notify_vmexit;
- /* bit mask of VM types */
- u32 supported_vm_types;
-
- u64 supported_mce_cap;
- u64 supported_xcr0;
- u64 supported_xss;
- u64 supported_perf_cap;
-
- u64 supported_quirks;
- u64 inapplicable_quirks;
-};
-
-struct kvm_host_values {
- /*
- * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
- * address bits irrespective of features that repurpose legal bits,
- * e.g. MKTME.
- */
- u8 maxphyaddr;
-
- u64 efer;
- u64 xcr0;
- u64 xss;
- u64 s_cet;
- u64 arch_capabilities;
-};
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
void kvm_spurious_fault(void);
@@ -86,14 +49,6 @@ do { \
failed; \
})
-/*
- * The first...last VMX feature MSRs that are emulated by KVM. This may or may
- * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
- * associated feature that KVM supports for nested virtualization.
- */
-#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
-#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
-
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -102,16 +57,6 @@ do { \
#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
-/*
- * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves
- * are arbitrary and have no meaning, the only requirement is that they don't
- * conflict with "real" MSRs that KVM supports. Use values at the upper end
- * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values
- * will be usable until KVM exhausts its supply of paravirtual MSR indices.
- */
-
-#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff
-
static inline unsigned int __grow_ple_window(unsigned int val,
unsigned int base, unsigned int modifier, unsigned int max)
{
@@ -142,9 +87,6 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
return max(val, min);
}
-#define MSR_IA32_CR_PAT_DEFAULT \
- PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
-
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
@@ -252,11 +194,6 @@ static inline bool x86_exception_has_error_code(unsigned int vector)
return (1U << vector) & exception_has_error_code;
}
-static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
-}
-
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
@@ -384,6 +321,8 @@ static __always_inline void kvm_request_l1tf_flush_l1d(void)
#endif
}
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -391,6 +330,29 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
int kvm_guest_time_update(struct kvm_vcpu *v);
+void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value);
+u64 kvm_scale_tsc(u64 tsc, u64 ratio);
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
+u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc);
+void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset);
+
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.l1_tsc_offset + adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc((u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
+ adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
@@ -403,21 +365,310 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
+void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu);
-int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
-int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
void *insn, int insn_len);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
-fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu);
-fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
+/*
+ * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
+ * userspace I/O) to indicate that the emulation context
+ * should be reused as is, i.e. skip initialization of
+ * emulation context, instruction fetch and decode.
+ *
+ * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
+ * Indicates that only select instructions (tagged with
+ * EmulateOnUD) should be emulated (to minimize the emulator
+ * attack surface). See also EMULTYPE_TRAP_UD_FORCED.
+ *
+ * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
+ * decode the instruction length. For use *only* by
+ * kvm_x86_ops.skip_emulated_instruction() implementations if
+ * EMULTYPE_COMPLETE_USER_EXIT is not set.
+ *
+ * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
+ * retry native execution under certain conditions.
+ * Can only be set in conjunction with EMULTYPE_PF.
+ *
+ * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
+ * triggered by KVM's magic "force emulation" prefix,
+ * which is opt in via module param (off by default).
+ * Bypasses EmulateOnUD restriction despite emulating
+ * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
+ * Used to test the full emulator from userspace.
+ *
+ * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * backdoor emulation, which is opt in via module param.
+ * VMware backdoor emulation handles select instructions
+ * and reinjects the #GP for all other cases.
+ *
+ * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case
+ * the CR2/GPA value passed on the stack is valid.
+ *
+ * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
+ * state and inject single-step #DBs after skipping
+ * an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ * is attempting to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself,
+ * and the owning page table is being shadowed by KVM.
+ * If emulation of the faulting instruction fails and
+ * this flag is set, KVM will exit to userspace instead
+ * of retrying emulation as KVM cannot make forward
+ * progress.
+ *
+ * If emulation fails for a write to guest page tables,
+ * KVM unprotects (zaps) the shadow page for the target
+ * gfn and resumes the guest to retry the non-emulatable
+ * instruction (on hardware). Unprotecting the gfn
+ * doesn't allow forward progress for a self-changing
+ * access because doing so also zaps the translation for
+ * the gfn, i.e. retrying the instruction will hit a
+ * !PRESENT fault, which results in a new shadow page
+ * and sends KVM back to square one.
+ *
+ * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
+ * an instruction if it could generate a given software
+ * interrupt, which must be encoded via
+ * EMULTYPE_SET_SOFT_INT_VECTOR().
+ */
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
+#define EMULTYPE_ALLOW_RETRY_PF (1 << 3)
+#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
+#define EMULTYPE_VMWARE_GP (1 << 5)
+#define EMULTYPE_PF (1 << 6)
+#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
+#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
+
+#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
+#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
+
+static inline bool kvm_can_emulate_event_vectoring(int emul_type)
+{
+ return !(emul_type & EMULTYPE_PF);
+}
+
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len);
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+ u64 *data, u8 ndata);
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
+
+void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa);
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason);
+
fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu);
-extern struct kvm_caps kvm_caps;
-extern struct kvm_host_values kvm_host;
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code);
+
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault,
+ bool from_hardware);
+void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault,
+ bool from_hardware);
+
+static inline void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ __kvm_inject_emulated_page_fault(vcpu, fault, false);
+}
+
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
+
+void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+ u32 size);
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
+
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+
+enum kvm_apicv_inhibit {
+
+ /********************************************************************/
+ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+ /********************************************************************/
+
+ /*
+ * APIC acceleration is disabled by a module parameter
+ * and/or not supported in hardware.
+ */
+ APICV_INHIBIT_REASON_DISABLED,
+
+ /*
+ * APIC acceleration is inhibited because AutoEOI feature is
+ * being used by a HyperV guest.
+ */
+ APICV_INHIBIT_REASON_HYPERV,
+
+ /*
+ * APIC acceleration is inhibited because the userspace didn't yet
+ * enable the kernel/split irqchip.
+ */
+ APICV_INHIBIT_REASON_ABSENT,
+
+ /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
+ * (out of band, debug measure of blocking all interrupts on this vCPU)
+ * was enabled, to avoid AVIC/APICv bypassing it.
+ */
+ APICV_INHIBIT_REASON_BLOCKIRQ,
+
+ /*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
+ /*
+ * For simplicity, the APIC acceleration is inhibited
+ * the first time either APIC ID or APIC base are changed by the guest
+ * from their reset values.
+ */
+ APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+ /******************************************************/
+ /* INHIBITs that are relevant only to the AMD's AVIC. */
+ /******************************************************/
+
+ /*
+ * AVIC is inhibited on a vCPU because it runs a nested guest.
+ *
+ * This is needed because unlike APICv, the peers of this vCPU
+ * cannot use the doorbell mechanism to signal interrupts via AVIC when
+ * a vCPU runs nested.
+ */
+ APICV_INHIBIT_REASON_NESTED,
+
+ /*
+ * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
+ * which cannot be injected when the AVIC is enabled, thus AVIC
+ * is inhibited while KVM waits for IRQ window.
+ */
+ APICV_INHIBIT_REASON_IRQWIN,
+
+ /*
+ * PIT (i8254) 're-inject' mode relies on EOI intercept,
+ * which AVIC doesn't support for edge triggered interrupts.
+ */
+ APICV_INHIBIT_REASON_PIT_REINJ,
+
+ /*
+ * AVIC is disabled because SEV doesn't support it.
+ */
+ APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+
+ /*
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
+ NR_APICV_INHIBIT_REASONS,
+};
+
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
+
+bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason, bool set);
+
+static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
+{
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, true);
+}
+
+static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
+ enum kvm_apicv_inhibit reason)
+{
+ kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
+}
+
+void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc);
+
+static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm)
+{
+ kvm_inc_or_dec_irq_window_inhibit(kvm, true);
+}
+
+static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm)
+{
+ kvm_inc_or_dec_irq_window_inhibit(kvm, false);
+}
+
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap);
void kvm_setup_xss_caps(void);
@@ -461,22 +712,6 @@ extern bool enable_vmware_backdoor;
extern int pi_inject_timer;
-extern bool report_ignored_msrs;
-
-extern bool eager_page_split;
-
-static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
-}
-
-static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
-{
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
-}
-
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -575,6 +810,8 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apf.gfns[i] = ~0;
}
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
/*
* Trigger machine check on the host. We assume all the MSRs are already set up
* by the CPU and that we still run on the same CPU as the MCE occurred on.
@@ -594,32 +831,10 @@ static inline void kvm_machine_check(void)
#endif
}
-int kvm_spec_ctrl_test_value(u64 value);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
+void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
-bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
-
-enum kvm_msr_access {
- MSR_TYPE_R = BIT(0),
- MSR_TYPE_W = BIT(1),
- MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W,
-};
-
-/*
- * Internal error codes that are used to indicate that MSR emulation encountered
- * an error that should result in #GP in the guest, unless userspace handles it.
- * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
- * as part of KVM's lightly documented internal KVM_RUN return codes.
- *
- * UNSUPPORTED - The MSR isn't supported, either because it is completely
- * unknown to KVM, or because the MSR should not exist according
- * to the vCPU model.
- *
- * FILTERED - Access to the MSR is denied by a userspace MSR filter.
- */
-#define KVM_MSR_RET_UNSUPPORTED 2
-#define KVM_MSR_RET_FILTERED 3
int kvm_sev_es_mmio(struct kvm_vcpu *vcpu, bool is_write, gpa_t gpa,
unsigned int bytes, void *data);
@@ -679,27 +894,4 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-#define CET_US_RESERVED_BITS GENMASK(9, 6)
-#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0)
-#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
-#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12)
-
-static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
-{
- if (data & CET_US_RESERVED_BITS)
- return false;
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
- (data & CET_US_SHSTK_MASK_BITS))
- return false;
- if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
- (data & CET_US_IBT_MASK_BITS))
- return false;
- if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4))
- return false;
- /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */
- if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR))
- return false;
-
- return true;
-}
#endif
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index d28a057fa6c2..4ace12606e93 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -120,6 +120,7 @@ TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test
TEST_GEN_PROGS_x86 += x86/svm_nested_vmcb12_gpa
TEST_GEN_PROGS_x86 += x86/svm_nested_pat_test
TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state
+TEST_GEN_PROGS_x86 += x86/svm_pmu_host_guest_test
TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync
TEST_GEN_PROGS_x86 += x86/sync_regs_test
TEST_GEN_PROGS_x86 += x86/ucna_injection_test
diff --git a/tools/testing/selftests/kvm/include/x86/pmu.h b/tools/testing/selftests/kvm/include/x86/pmu.h
index 98537cc8840d..608ed83d7c6a 100644
--- a/tools/testing/selftests/kvm/include/x86/pmu.h
+++ b/tools/testing/selftests/kvm/include/x86/pmu.h
@@ -38,6 +38,12 @@
#define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23)
#define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24)
+/*
+ * These are AMD-specific bits.
+ */
+#define AMD64_EVENTSEL_GUESTONLY BIT_ULL(40)
+#define AMD64_EVENTSEL_HOSTONLY BIT_ULL(41)
+
/* RDPMC control flags, Intel only. */
#define INTEL_RDPMC_METRICS BIT_ULL(29)
#define INTEL_RDPMC_FIXED BIT_ULL(30)
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 513e4a1075fa..7d3a27bc0d84 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1226,6 +1226,8 @@ struct idt_entry {
void vm_install_exception_handler(struct kvm_vm *vm, int vector,
void (*handler)(struct ex_regs *));
+gva_t vm_alloc_stack(struct kvm_vm *vm, int nr_pages);
+
/*
* Exception fixup morphs #DE to an arbitrary magic vector so that '0' can be
* used to signal "no expcetion".
@@ -1392,6 +1394,14 @@ static inline bool kvm_is_pmu_enabled(void)
return get_kvm_param_bool("enable_pmu");
}
+static inline bool kvm_is_mediated_pmu_enabled(void)
+{
+ if (host_cpu_is_intel)
+ return get_kvm_intel_param_bool("enable_mediated_pmu");
+
+ return get_kvm_amd_param_bool("enable_mediated_pmu");
+}
+
static inline bool kvm_is_forced_emulation_enabled(void)
{
return !!get_kvm_param_integer("force_emulation_prefix");
diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h
index 6c013eb838be..c201c30485e7 100644
--- a/tools/testing/selftests/kvm/include/x86/svm_util.h
+++ b/tools/testing/selftests/kvm/include/x86/svm_util.h
@@ -28,6 +28,9 @@ struct svm_test_data {
void *msr_hva;
u64 msr_gpa;
+ /* Stack */
+ void *stack; /* gva */
+
/* NPT */
u64 ncr3_gpa;
};
@@ -57,7 +60,7 @@ static inline void vmmcall(void)
)
struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, gva_t *p_svm_gva);
-void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp);
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip);
void run_guest(struct vmcb *vmcb, u64 vmcb_gpa);
static inline bool kvm_cpu_has_npt(void)
diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 90fffaf91595..4bcfd60e3aec 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -524,6 +524,8 @@ struct vmx_pages {
u64 apic_access_gpa;
void *apic_access;
+ void *stack;
+
u64 eptp_gpa;
};
@@ -552,7 +554,7 @@ union vmx_ctrl_msr {
struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, gva_t *p_vmx_gva);
bool prepare_for_vmx_operation(struct vmx_pages *vmx);
-void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip);
bool load_vmcs(struct vmx_pages *vmx);
bool ept_1g_pages_supported(void);
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 61cf952cd2dc..e19e8b5a09c5 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -30,21 +30,15 @@ __asm__(
" ud2;"
);
-#define L2_GUEST_STACK_SIZE 64
-
static void l1_vmx_code(struct vmx_pages *vmx, u64 vcpu_id)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
- unsigned long *rsp;
-
GUEST_ASSERT(vmx->vmcs_gpa);
GUEST_ASSERT(prepare_for_vmx_operation(vmx));
GUEST_ASSERT(load_vmcs(vmx));
GUEST_ASSERT(ept_1g_pages_supported());
- rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
- *rsp = vcpu_id;
- prepare_vmcs(vmx, memstress_l2_guest_entry, rsp);
+ *(u64 *)vmx->stack = vcpu_id;
+ prepare_vmcs(vmx, memstress_l2_guest_entry);
GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
@@ -53,13 +47,8 @@ static void l1_vmx_code(struct vmx_pages *vmx, u64 vcpu_id)
static void l1_svm_code(struct svm_test_data *svm, u64 vcpu_id)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
- unsigned long *rsp;
-
-
- rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
- *rsp = vcpu_id;
- generic_svm_setup(svm, memstress_l2_guest_entry, rsp);
+ *(u64 *)svm->stack = vcpu_id;
+ generic_svm_setup(svm, memstress_l2_guest_entry);
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 4ca48de7a926..ef56dcefe011 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -778,6 +778,30 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
REPORT_GUEST_ASSERT(uc);
}
+gva_t vm_alloc_stack(struct kvm_vm *vm, int nr_pages)
+{
+ int size = nr_pages * getpagesize();
+ gva_t stack_gva;
+
+ stack_gva = __vm_alloc(vm, size, DEFAULT_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA);
+ stack_gva += size;
+
+ /*
+ * Align stack to match calling sequence requirements in section "The
+ * Stack Frame" of the System V ABI AMD64 Architecture Processor
+ * Supplement, which requires the value (%rsp + 8) to be a multiple of
+ * 16 when control is transferred to the function entry point.
+ *
+ * If this code is ever used to launch a vCPU with 32-bit entry point it
+ * may need to subtract 4 bytes instead of 8 bytes.
+ */
+ TEST_ASSERT(IS_ALIGNED(stack_gva, PAGE_SIZE),
+ "__vm_alloc() did not provide a page-aligned address");
+ stack_gva -= 8;
+
+ return stack_gva;
+}
+
void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
{
int r;
@@ -820,27 +844,8 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id)
{
struct kvm_mp_state mp_state;
struct kvm_regs regs;
- gva_t stack_gva;
struct kvm_vcpu *vcpu;
- stack_gva = __vm_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
- DEFAULT_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA);
-
- stack_gva += DEFAULT_STACK_PGS * getpagesize();
-
- /*
- * Align stack to match calling sequence requirements in section "The
- * Stack Frame" of the System V ABI AMD64 Architecture Processor
- * Supplement, which requires the value (%rsp + 8) to be a multiple of
- * 16 when control is transferred to the function entry point.
- *
- * If this code is ever used to launch a vCPU with 32-bit entry point it
- * may need to subtract 4 bytes instead of 8 bytes.
- */
- TEST_ASSERT(IS_ALIGNED(stack_gva, PAGE_SIZE),
- "__vm_alloc() did not provide a page-aligned address");
- stack_gva -= 8;
-
vcpu = __vm_vcpu_add(vm, vcpu_id);
vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
vcpu_init_sregs(vm, vcpu);
@@ -849,7 +854,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id)
/* Setup guest general purpose registers */
vcpu_regs_get(vcpu, &regs);
regs.rflags = regs.rflags | X86_EFLAGS_FIXED;
- regs.rsp = stack_gva;
+ regs.rsp = vm_alloc_stack(vm, DEFAULT_STACK_PGS);
vcpu_regs_set(vcpu, &regs);
/* Setup the MP state */
diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c
index 3b01605ab016..1445b890986f 100644
--- a/tools/testing/selftests/kvm/lib/x86/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86/svm.c
@@ -46,6 +46,8 @@ vcpu_alloc_svm(struct kvm_vm *vm, gva_t *p_svm_gva)
svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr);
memset(svm->msr_hva, 0, getpagesize());
+ svm->stack = (void *)vm_alloc_stack(vm, 1);
+
if (vm->stage2_mmu.pgd_created)
svm->ncr3_gpa = vm->stage2_mmu.pgd;
@@ -81,7 +83,7 @@ void vm_enable_npt(struct kvm_vm *vm)
tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks);
}
-void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp)
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip)
{
struct vmcb *vmcb = svm->vmcb;
u64 vmcb_gpa = svm->vmcb_gpa;
@@ -122,7 +124,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
ctrl->msrpm_base_pa = svm->msr_gpa;
vmcb->save.rip = (u64)guest_rip;
- vmcb->save.rsp = (u64)guest_rsp;
+ vmcb->save.rsp = (u64)svm->stack;
guest_regs.rdi = (u64)svm;
if (svm->ncr3_gpa) {
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 7c10ba6e6fb4..cd09c9de4485 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -116,6 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, gva_t *p_vmx_gva)
vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
memset(vmx->vmwrite_hva, 0, getpagesize());
+ vmx->stack = (void *)vm_alloc_stack(vm, 1);
+
if (vm->stage2_mmu.pgd_created)
vmx->eptp_gpa = vm->stage2_mmu.pgd;
@@ -366,11 +368,11 @@ static inline void init_vmcs_guest_state(void *rip, void *rsp)
vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
}
-void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip)
{
init_vmcs_control_fields(vmx);
init_vmcs_host_state();
- init_vmcs_guest_state(guest_rip, guest_rsp);
+ init_vmcs_guest_state(guest_rip, vmx->stack);
}
bool kvm_cpu_has_ept(void)
diff --git a/tools/testing/selftests/kvm/x86/aperfmperf_test.c b/tools/testing/selftests/kvm/x86/aperfmperf_test.c
index c91660103137..845cb685f174 100644
--- a/tools/testing/selftests/kvm/x86/aperfmperf_test.c
+++ b/tools/testing/selftests/kvm/x86/aperfmperf_test.c
@@ -54,8 +54,6 @@ static void guest_read_aperf_mperf(void)
GUEST_SYNC2(rdmsr(MSR_IA32_APERF), rdmsr(MSR_IA32_MPERF));
}
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code(void)
{
guest_read_aperf_mperf();
@@ -64,21 +62,18 @@ static void l2_guest_code(void)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
- generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
run_guest(vmcb, svm->vmcb_gpa);
}
static void l1_vmx_code(struct vmx_pages *vmx)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
GUEST_ASSERT_EQ(load_vmcs(vmx), true);
- prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, NULL);
/*
* Enable MSR bitmaps (the bitmap itself is allocated, zeroed, and set
diff --git a/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c b/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c
index 5b3aef109cfc..77ce87c41a86 100644
--- a/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c
+++ b/tools/testing/selftests/kvm/x86/evmcs_smm_controls_test.c
@@ -52,8 +52,6 @@ static void l2_guest_code(void)
static void guest_code(struct vmx_pages *vmx_pages,
struct hyperv_test_pages *hv_pages)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
/* Set up Hyper-V enlightenments and eVMCS */
wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
@@ -62,8 +60,7 @@ static void guest_code(struct vmx_pages *vmx_pages,
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_evmcs(hv_pages));
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
GUEST_ASSERT(!vmlaunch());
diff --git a/tools/testing/selftests/kvm/x86/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c
index c7fa114aee20..1bda2cd3f739 100644
--- a/tools/testing/selftests/kvm/x86/hyperv_evmcs.c
+++ b/tools/testing/selftests/kvm/x86/hyperv_evmcs.c
@@ -78,9 +78,6 @@ void l2_guest_code(void)
void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages,
gpa_t hv_hcall_page_gpa)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa);
@@ -100,8 +97,7 @@ void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages,
GUEST_SYNC(4);
GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
GUEST_SYNC(5);
GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
diff --git a/tools/testing/selftests/kvm/x86/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c
index 7a62f6a9d606..1f74b0fa9b83 100644
--- a/tools/testing/selftests/kvm/x86/hyperv_svm_test.c
+++ b/tools/testing/selftests/kvm/x86/hyperv_svm_test.c
@@ -18,8 +18,6 @@
#include "svm_util.h"
#include "hyperv.h"
-#define L2_GUEST_STACK_SIZE 256
-
/* Exit to L1 from L2 with RDMSR instruction */
static inline void rdmsr_from_l2(u32 msr)
{
@@ -69,7 +67,6 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm,
struct hyperv_test_pages *hv_pages,
gpa_t pgs_gpa)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
@@ -81,8 +78,7 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm,
GUEST_ASSERT(svm->vmcb_gpa);
/* Prepare for L2 execution. */
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
/* L2 TLB flush setup */
hve->partition_assist_page = hv_pages->partition_assist_gpa;
diff --git a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c
index 52014a3210c8..25a182be00a9 100644
--- a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c
+++ b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c
@@ -26,8 +26,6 @@ static void guest_generate_buslocks(void)
atomic_inc(val);
}
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code(void)
{
guest_generate_buslocks();
@@ -36,21 +34,18 @@ static void l2_guest_code(void)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
- generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
run_guest(vmcb, svm->vmcb_gpa);
}
static void l1_vmx_code(struct vmx_pages *vmx)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
GUEST_ASSERT_EQ(load_vmcs(vmx), true);
- prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, NULL);
GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code));
GUEST_ASSERT(!vmlaunch());
diff --git a/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c
index 761fec293408..b974cfb347d6 100644
--- a/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c
@@ -21,8 +21,6 @@ enum {
PORT_L0_EXIT = 0x2000,
};
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code(void)
{
/* Exit to L0 */
@@ -32,14 +30,11 @@ static void l2_guest_code(void)
static void l1_vmx_code(struct vmx_pages *vmx_pages)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
/* Prepare the VMCS for L2 execution. */
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT(0);
@@ -47,11 +42,8 @@ static void l1_vmx_code(struct vmx_pages *vmx_pages)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
/* Prepare the VMCB for L2 execution. */
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT(0);
diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
index 0e67cce83570..26b474bf1353 100644
--- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
@@ -40,8 +40,6 @@
#define TEST_HVA(vm, idx) addr_gpa2hva(vm, TEST_GPA(idx))
-#define L2_GUEST_STACK_SIZE 64
-
/* Use the page offset bits to communicate the access+fault type. */
#define TEST_SYNC_READ_FAULT BIT(0)
#define TEST_SYNC_WRITE_FAULT BIT(1)
@@ -92,7 +90,6 @@ static void l2_guest_code_tdp_disabled(void)
void l1_vmx_code(struct vmx_pages *vmx)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
void *l2_rip;
GUEST_ASSERT(vmx->vmcs_gpa);
@@ -104,7 +101,7 @@ void l1_vmx_code(struct vmx_pages *vmx)
else
l2_rip = l2_guest_code_tdp_disabled;
- prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, l2_rip);
GUEST_SYNC(TEST_SYNC_NO_FAULT);
GUEST_ASSERT(!vmlaunch());
@@ -115,7 +112,6 @@ void l1_vmx_code(struct vmx_pages *vmx)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
void *l2_rip;
if (svm->ncr3_gpa)
@@ -123,7 +119,7 @@ static void l1_svm_code(struct svm_test_data *svm)
else
l2_rip = l2_guest_code_tdp_disabled;
- generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_rip);
GUEST_SYNC(TEST_SYNC_NO_FAULT);
run_guest(svm->vmcb, svm->vmcb_gpa);
diff --git a/tools/testing/selftests/kvm/x86/nested_emulation_test.c b/tools/testing/selftests/kvm/x86/nested_emulation_test.c
index fb7dcbe53ac7..e08c6b0697e5 100644
--- a/tools/testing/selftests/kvm/x86/nested_emulation_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_emulation_test.c
@@ -57,7 +57,7 @@ static void guest_code(void *test_data)
struct svm_test_data *svm = test_data;
struct vmcb *vmcb = svm->vmcb;
- generic_svm_setup(svm, NULL, NULL);
+ generic_svm_setup(svm, NULL);
vmcb->save.idtr.limit = 0;
vmcb->save.rip = (u64)l2_guest_code;
@@ -69,7 +69,7 @@ static void guest_code(void *test_data)
GUEST_ASSERT(prepare_for_vmx_operation(test_data));
GUEST_ASSERT(load_vmcs(test_data));
- prepare_vmcs(test_data, NULL, NULL);
+ prepare_vmcs(test_data, NULL);
GUEST_ASSERT(!vmwrite(GUEST_IDTR_LIMIT, 0));
GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code));
GUEST_ASSERT(!vmwrite(EXCEPTION_BITMAP, 0));
diff --git a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c
index 186e980aa8ee..aeec3121c8e8 100644
--- a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c
@@ -5,8 +5,6 @@
#include "vmx.h"
#include "svm_util.h"
-#define L2_GUEST_STACK_SIZE 256
-
/*
* Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with
* the "real" exceptions used, #SS/#GP/#DF (12/13/8).
@@ -91,9 +89,8 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,
static void l1_svm_code(struct svm_test_data *svm)
{
struct vmcb_control_area *ctrl = &svm->vmcb->control;
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
- generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, NULL);
svm->vmcb->save.idtr.limit = 0;
ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN);
@@ -128,13 +125,11 @@ static void vmx_run_l2(void *l2_code, int vector, u32 error_code)
static void l1_vmx_code(struct vmx_pages *vmx)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
GUEST_ASSERT_EQ(load_vmcs(vmx), true);
- prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, NULL);
GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0);
/*
diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c
index 11fd2467d823..8c2ba9674558 100644
--- a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c
@@ -11,8 +11,6 @@
#include "kselftest.h"
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code(void)
{
vmcall();
@@ -20,11 +18,9 @@ static void l2_guest_code(void)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
uintptr_t save_cr3;
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
/* Try to run L2 with invalid CR3 and make sure it fails */
save_cr3 = svm->vmcb->save.cr3;
@@ -42,14 +38,12 @@ static void l1_svm_code(struct svm_test_data *svm)
static void l1_vmx_code(struct vmx_pages *vmx_pages)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
uintptr_t save_cr3;
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
/* Try to run L2 with invalid CR3 and make sure it fails */
save_cr3 = vmreadz(GUEST_CR3);
diff --git a/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c
index fa95568f55ff..2e04563790ff 100644
--- a/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c
@@ -9,8 +9,6 @@
#include "svm_util.h"
#include "vmx.h"
-#define L2_GUEST_STACK_SIZE 64
-
enum test_type {
TEST_FINAL_PAGE_UNMAPPED, /* Final data page not present */
TEST_PT_PAGE_UNMAPPED, /* Page table page not present */
@@ -54,14 +52,13 @@ static void l2_guest_code_ins(void)
static void l1_vmx_code(struct vmx_pages *vmx, u64 expected_fault_gpa,
u64 test_type)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u64 exit_qual;
GUEST_ASSERT(vmx->vmcs_gpa);
GUEST_ASSERT(prepare_for_vmx_operation(vmx));
GUEST_ASSERT(load_vmcs(vmx));
- prepare_vmcs(vmx, l2_entry, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, l2_entry);
GUEST_ASSERT(!vmlaunch());
@@ -120,12 +117,10 @@ static void l1_vmx_code(struct vmx_pages *vmx, u64 expected_fault_gpa,
static void l1_svm_code(struct svm_test_data *svm, u64 expected_fault_gpa,
u64 test_type)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
u64 exit_info_1;
- generic_svm_setup(svm, l2_entry,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_entry);
run_guest(vmcb, svm->vmcb_gpa);
diff --git a/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c
index f0e4adac4751..cb79d7b9619c 100644
--- a/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c
@@ -34,8 +34,6 @@
#define TSC_ADJUST_VALUE (1ll << 32)
#define TSC_OFFSET_VALUE -(1ll << 48)
-#define L2_GUEST_STACK_SIZE 64
-
enum {
PORT_ABORT = 0x1000,
PORT_REPORT,
@@ -75,8 +73,6 @@ static void l2_guest_code(void)
static void l1_guest_code(void *data)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
/* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */
GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE);
wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
@@ -93,8 +89,7 @@ static void l1_guest_code(void *data)
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
@@ -105,8 +100,7 @@ static void l1_guest_code(void *data)
} else {
struct svm_test_data *svm = data;
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE;
run_guest(svm->vmcb, svm->vmcb_gpa);
diff --git a/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c
index 190e93af20a1..18f765835bf4 100644
--- a/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c
@@ -22,8 +22,6 @@
#define TSC_OFFSET_L2 ((u64)-33125236320908)
#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
-#define L2_GUEST_STACK_SIZE 64
-
enum { USLEEP, UCHECK_L1, UCHECK_L2 };
#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec)
#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
@@ -82,13 +80,10 @@ static void l2_guest_code(void)
static void l1_svm_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
/* check that L1's frequency looks alright before launching L2 */
check_tsc_freq(UCHECK_L1);
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
/* enable TSC scaling for L2 */
wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32);
@@ -105,7 +100,6 @@ static void l1_svm_code(struct svm_test_data *svm)
static void l1_vmx_code(struct vmx_pages *vmx_pages)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u32 control;
/* check that L1's frequency looks alright before launching L2 */
@@ -115,7 +109,7 @@ static void l1_vmx_code(struct vmx_pages *vmx_pages)
GUEST_ASSERT(load_vmcs(vmx_pages));
/* prepare the VMCS for L2 execution */
- prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
/* enable TSC offsetting and TSC scaling for L2 */
control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
index 85d3f4cc76f3..a130759f39a1 100644
--- a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
@@ -28,8 +28,6 @@
#define TEST_VMCB_L2_GPA TEST_VMCB_L1_GPA(0)
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code_vmsave(void)
{
asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory");
@@ -70,10 +68,8 @@ static void l2_guest_code_vmcb1(void)
static void l1_guest_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
/* Each test case initializes the guest RIP below */
- generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, NULL);
/* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */
svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) |
diff --git a/tools/testing/selftests/kvm/x86/smm_test.c b/tools/testing/selftests/kvm/x86/smm_test.c
index 740051167dbd..e2542f4ced60 100644
--- a/tools/testing/selftests/kvm/x86/smm_test.c
+++ b/tools/testing/selftests/kvm/x86/smm_test.c
@@ -63,8 +63,6 @@ static void l2_guest_code(void)
static void guest_code(void *arg)
{
- #define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u64 apicbase = rdmsr(MSR_IA32_APICBASE);
struct svm_test_data *svm = arg;
struct vmx_pages *vmx_pages = arg;
@@ -81,13 +79,11 @@ static void guest_code(void *arg)
if (arg) {
if (this_cpu_has(X86_FEATURE_SVM)) {
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
} else {
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
}
sync_with_host(5);
diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c
index 409c6cc9f921..4a1056a6cb8d 100644
--- a/tools/testing/selftests/kvm/x86/state_test.c
+++ b/tools/testing/selftests/kvm/x86/state_test.c
@@ -19,8 +19,6 @@
#include "vmx.h"
#include "svm_util.h"
-#define L2_GUEST_STACK_SIZE 256
-
void svm_l2_guest_code(void)
{
GUEST_SYNC(4);
@@ -35,13 +33,11 @@ void svm_l2_guest_code(void)
static void svm_l1_guest_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
GUEST_ASSERT(svm->vmcb_gpa);
/* Prepare for L2 execution. */
- generic_svm_setup(svm, svm_l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, svm_l2_guest_code);
vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK);
@@ -78,8 +74,6 @@ void vmx_l2_guest_code(void)
static void vmx_l1_guest_code(struct vmx_pages *vmx_pages)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT(vmx_pages->vmcs_gpa);
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_SYNC(3);
@@ -89,8 +83,7 @@ static void vmx_l1_guest_code(struct vmx_pages *vmx_pages)
GUEST_SYNC(4);
GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
- prepare_vmcs(vmx_pages, vmx_l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, vmx_l2_guest_code);
GUEST_SYNC(5);
GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
diff --git a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c
index d3cc5e4f7883..7b1f4a4818bd 100644
--- a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c
@@ -54,15 +54,12 @@ static void l2_guest_code(struct svm_test_data *svm)
static void l1_guest_code(struct svm_test_data *svm)
{
- #define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
x2apic_enable();
/* Prepare for L2 execution. */
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
/* No virtual interrupt masking */
vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
diff --git a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
index 7fbfaa054c95..77c6ce9f4507 100644
--- a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
+++ b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
@@ -9,8 +9,6 @@
#include "svm_util.h"
-#define L2_GUEST_STACK_SIZE 64
-
#define DO_BRANCH() do { asm volatile("jmp 1f\n 1: nop"); } while (0)
struct lbr_branch {
@@ -55,7 +53,6 @@ static void l2_guest_code(struct svm_test_data *svm)
static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
struct lbr_branch l1_branch;
@@ -65,8 +62,7 @@ static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv)
CHECK_BRANCH_MSRS(&l1_branch);
/* Run L2, which will also do the same */
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
if (nested_lbrv)
vmcb->control.misc_ctl2 = SVM_MISC2_ENABLE_V_LBR;
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
index 6a89eaffc657..6bc301207cbc 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
@@ -8,8 +8,6 @@
#include "kselftest.h"
-#define L2_GUEST_STACK_SIZE 64
-
static void l2_guest_code(void)
{
unsigned long efer = rdmsr(MSR_EFER);
@@ -24,10 +22,7 @@ static void l2_guest_code(void)
static void l1_guest_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
run_guest(svm->vmcb, svm->vmcb_gpa);
/* Unreachable, L1 should be shutdown */
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c
index 92da8ff34da1..14ec9d6ad195 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c
@@ -25,8 +25,6 @@
#include "processor.h"
#include "svm_util.h"
-#define L2_GUEST_STACK_SIZE 256
-
#define PAT_DEFAULT 0x0007040600070406ULL
#define L1_PAT_VALUE 0x0007040600070404ULL /* Change PA0 to WT */
#define L2_VMCB12_PAT 0x0606060606060606ULL /* All WB */
@@ -59,14 +57,13 @@ static void l2_guest_code(void)
static void l1_guest_code(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
int i;
wrmsr(MSR_IA32_CR_PAT, L1_PAT_VALUE);
GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L1_PAT_VALUE);
- generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
vmcb->save.g_pat = L2_VMCB12_PAT;
vmcb->control.intercept &= ~(1ULL << INTERCEPT_MSR_PROT);
@@ -94,11 +91,10 @@ static void l1_guest_code(struct svm_test_data *svm)
static void l1_guest_code_invalid_gpat(struct svm_test_data *svm)
{
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
/* VMRUN should fail without running L2 */
- generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, NULL);
vmcb->save.g_pat = INVALID_PAT_VALUE;
run_guest(vmcb, svm->vmcb_gpa);
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c
index c6ea3d609a62..2a4a216954bb 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_shutdown_test.c
@@ -19,12 +19,9 @@ static void l2_guest_code(struct svm_test_data *svm)
static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt)
{
- #define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
index f72f11d4c4f8..0b640d09d194 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
@@ -78,17 +78,13 @@ static void l2_guest_code_nmi(void)
static void l1_guest_code(struct svm_test_data *svm, u64 is_nmi, u64 idt_alt)
{
- #define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
if (is_nmi)
x2apic_enable();
/* Prepare for L2 execution. */
- generic_svm_setup(svm,
- is_nmi ? l2_guest_code_nmi : l2_guest_code_int,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, is_nmi ? l2_guest_code_nmi : l2_guest_code_int);
vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR);
vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT);
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
index a4935ce2fb99..b3f45035745f 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
@@ -9,14 +9,9 @@
#include "kvm_test_harness.h"
#include "test_util.h"
-
-#define L2_GUEST_STACK_SIZE 64
-
#define SYNC_GP 101
#define SYNC_L2_STARTED 102
-static unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
static void guest_gp_handler(struct ex_regs *regs)
{
GUEST_SYNC(SYNC_GP);
@@ -30,28 +25,28 @@ static void l2_code(void)
static void l1_vmrun(struct svm_test_data *svm, gpa_t gpa)
{
- generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_code);
asm volatile ("vmrun %[gpa]" : : [gpa] "a" (gpa) : "memory");
}
static void l1_vmload(struct svm_test_data *svm, gpa_t gpa)
{
- generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_code);
asm volatile ("vmload %[gpa]" : : [gpa] "a" (gpa) : "memory");
}
static void l1_vmsave(struct svm_test_data *svm, gpa_t gpa)
{
- generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_code);
asm volatile ("vmsave %[gpa]" : : [gpa] "a" (gpa) : "memory");
}
static void l1_vmexit(struct svm_test_data *svm, gpa_t gpa)
{
- generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_code);
run_guest(svm->vmcb, svm->vmcb_gpa);
GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL);
diff --git a/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c b/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c
new file mode 100644
index 000000000000..c5b5cd788d93
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/svm_pmu_host_guest_test.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM nested SVM PMU Host-Only/Guest-Only test
+ *
+ * Copyright (C) 2026, Google LLC.
+ *
+ * Test that KVM correctly virtualizes the AMD PMU Host-Only (bit 41) and
+ * Guest-Only (bit 40) event selector bits across all SVM state
+ * transitions.
+ *
+ * Programs 4 PMCs simultaneously with all combinations of Host-Only and
+ * Guest-Only bits, then verifies correct counting behavior with different
+ * combinations of EFER.SVME and host/guest mode -- as well as event filtering.
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "pmu.h"
+
+/*
+ * Event selector value used for every PMC in this test: the Zen "instructions
+ * retired" event with the OS, USR and ENABLE bits set.
+ */
+#define EVENTSEL_RETIRED_INSNS (ARCH_PERFMON_EVENTSEL_OS | \
+ ARCH_PERFMON_EVENTSEL_USR | \
+ ARCH_PERFMON_EVENTSEL_ENABLE | \
+ AMD_ZEN_INSTRUCTIONS_RETIRED)
+
+/* PMC configurations: index corresponds to Host-Only | Guest-Only bits */
+#define PMC_NONE 0 /* Neither bit set */
+#define PMC_G 1 /* Guest-Only bit set */
+#define PMC_H 2 /* Host-Only bit set */
+#define PMC_HG 3 /* Both bits set */
+#define NR_PMCS 4 /* Number of PMCs programmed and checked by the test */
+
+/* Iteration count of the NOP loop in run_instruction_loop() */
+#define LOOP_INSNS 1000
+
+/* Execute a single NOP LOOP_INSNS times. */
+static __always_inline void run_instruction_loop(void)
+{
+	unsigned int remaining = LOOP_INSNS;
+
+	while (remaining--)
+		__asm__ __volatile__("nop");
+}
+
+/*
+ * Read the current value of each of the NR_PMCS counters into counts[].
+ * Counter MSRs are addressed as MSR_F15H_PERF_CTR + 2 * index.
+ */
+static __always_inline void read_counters(uint64_t *counts)
+{
+	int idx = 0;
+
+	while (idx < NR_PMCS) {
+		counts[idx] = rdmsr(MSR_F15H_PERF_CTR + 2 * idx);
+		idx++;
+	}
+}
+
+/*
+ * Sample all counters, run the NOP loop, sample again, and store the
+ * per-counter difference (after - before) in deltas[].
+ */
+static __always_inline void run_and_measure(uint64_t *deltas)
+{
+	uint64_t start[NR_PMCS];
+	uint64_t end[NR_PMCS];
+	int pmc;
+
+	read_counters(start);
+	run_instruction_loop();
+	read_counters(end);
+
+	pmc = 0;
+	while (pmc < NR_PMCS) {
+		deltas[pmc] = end[pmc] - start[pmc];
+		pmc++;
+	}
+}
+
+/*
+ * Check every counter delta against a bitmask of expectations: bit i set in
+ * expected_counting means counter i must have advanced (non-zero delta), bit i
+ * clear means it must not have moved at all.
+ */
+static void assert_pmc_counts(uint64_t *deltas, unsigned int expected_counting)
+{
+	int i = 0;
+
+	while (i < NR_PMCS) {
+		if (!(expected_counting & BIT(i))) {
+			GUEST_ASSERT_EQ(deltas[i], 0);
+			i++;
+			continue;
+		}
+
+		GUEST_ASSERT_NE(deltas[i], 0);
+		i++;
+	}
+}
+
+/*
+ * Counter deltas measured by l2_guest_code(); L1 checks them with
+ * assert_pmc_counts() after the nested guest has exited.
+ */
+static uint64_t l2_deltas[NR_PMCS];
+
+/*
+ * L2 entry point: measure the counters around the NOP loop, then execute
+ * VMMCALL (L1 asserts that the resulting exit code is SVM_EXIT_VMMCALL).
+ */
+static void l2_guest_code(void)
+{
+ run_and_measure(l2_deltas);
+ vmmcall();
+}
+
+/*
+ * L1 guest entry point.
+ *
+ * Programs NR_PMCS counters with the same event but every combination of the
+ * Host-Only/Guest-Only bits, then walks through the numbered steps below,
+ * asserting after each one exactly which counters advanced.
+ *
+ * @svm: nested SVM test data; supplies the VMCB and its GPA used to run L2.
+ */
+static void l1_guest_code(struct svm_test_data *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ uint64_t deltas[NR_PMCS];
+ uint64_t eventsel;
+ int i;
+
+ /* Program 4 PMCs with all combinations of Host-Only/Guest-Only bits */
+ for (i = 0; i < NR_PMCS; i++) {
+ eventsel = EVENTSEL_RETIRED_INSNS;
+ if (i & PMC_G)
+ eventsel |= AMD64_EVENTSEL_GUESTONLY;
+ if (i & PMC_H)
+ eventsel |= AMD64_EVENTSEL_HOSTONLY;
+ wrmsr(MSR_F15H_PERF_CTL + 2 * i, eventsel);
+ /* Start every counter from zero */
+ wrmsr(MSR_F15H_PERF_CTR + 2 * i, 0);
+ }
+
+ /* Step 1: SVME=0 - Only the counter with neither bit set counts */
+ wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVME);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE));
+
+ /* Step 2: Set SVME=1 - In L1 "host mode"; Guest-Only stops */
+ wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG));
+
+ /* Step 3: VMRUN to L2 - In "guest mode"; Host-Only stops */
+ generic_svm_setup(svm, l2_guest_code);
+ /*
+ * Clear the MSR_PROT intercept bit; presumably so that L2's counter
+ * reads do not exit to L1 - TODO confirm against generic_svm_setup().
+ */
+ vmcb->control.intercept &= ~(1ULL << INTERCEPT_MSR_PROT);
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ /* L2 ends with VMMCALL; its measurements are left in l2_deltas */
+ GUEST_ASSERT_EQ(vmcb->control.exit_code, SVM_EXIT_VMMCALL);
+ assert_pmc_counts(l2_deltas, BIT(PMC_NONE) | BIT(PMC_G) | BIT(PMC_HG));
+
+ /* Step 4: After VMEXIT to L1 - Back in "host mode"; Guest-Only stops */
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG));
+
+ /*
+ * Step 5: Set KVM_PMU_EVENT_DENY - all counters stop. The host applies
+ * the synced value as the action of the PMU event filter.
+ */
+ GUEST_SYNC(KVM_PMU_EVENT_DENY);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, 0);
+
+ /* Step 6: Set KVM_PMU_EVENT_ALLOW - back to all except Guest-Only */
+ GUEST_SYNC(KVM_PMU_EVENT_ALLOW);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG));
+
+ /* Step 7: Clear Host-Only for PMC_HG - counter stops in "host mode" */
+ eventsel = rdmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG);
+ wrmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG, eventsel & ~AMD64_EVENTSEL_HOSTONLY);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H));
+
+ /*
+ * Step 8: Restore Host-Only for PMC_HG - counter counts again.
+ * eventsel still holds the value read in step 7, before the bit was
+ * cleared.
+ */
+ wrmsr(MSR_F15H_PERF_CTL + 2 * PMC_HG, eventsel);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE) | BIT(PMC_H) | BIT(PMC_HG));
+
+ /* Step 9: Clear SVME - Only the counter with neither bit set counts */
+ wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SVME);
+ run_and_measure(deltas);
+ assert_pmc_counts(deltas, BIT(PMC_NONE));
+
+ GUEST_DONE();
+}
+
+/*
+ * Allocate a PMU event filter that lists exactly one event.
+ *
+ * The fixed-size header is zeroed, the single event is stored in the trailing
+ * events[] array, and the action starts out as KVM_PMU_EVENT_ALLOW.  The
+ * caller owns the returned allocation.
+ */
+static struct kvm_pmu_event_filter *alloc_event_filter(u64 event)
+{
+	const size_t total_size = sizeof(struct kvm_pmu_event_filter) + sizeof(event);
+	struct kvm_pmu_event_filter *filter = malloc(total_size);
+
+	TEST_ASSERT(filter != NULL, "Filter allocation failed");
+
+	/* Only the header is cleared; the one event slot is written below. */
+	memset(filter, 0, sizeof(*filter));
+	filter->action = KVM_PMU_EVENT_ALLOW;
+	filter->nevents = 1;
+	filter->events[0] = event;
+
+	return filter;
+}
+
+/*
+ * Host side of the test.
+ *
+ * Requires SVM, an enabled PMU and the mediated PMU, then creates a one-vCPU
+ * VM running l1_guest_code() and services its ucalls until the guest is done.
+ * Each UCALL_SYNC asks the host to (re)install the instructions-retired event
+ * filter with the action the guest passed along.
+ */
+int main(int argc, char *argv[])
+{
+	struct kvm_pmu_event_filter *filter;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+	gva_t svm_gva;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+	TEST_REQUIRE(kvm_is_pmu_enabled());
+	TEST_REQUIRE(kvm_is_mediated_pmu_enabled());
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+	/* Hand the nested SVM test data to the guest as its only argument. */
+	vcpu_alloc_svm(vm, &svm_gva);
+	vcpu_args_set(vcpu, 1, svm_gva);
+
+	filter = alloc_event_filter(AMD_ZEN_INSTRUCTIONS_RETIRED);
+
+	for (;;) {
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			goto done;
+		case UCALL_DONE:
+			goto done;
+		case UCALL_SYNC:
+			/* The guest selects DENY or ALLOW via GUEST_SYNC(). */
+			filter->action = uc.args[1];
+			vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, filter);
+			break;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+			goto done;
+		}
+	}
+done:
+	/* Release the filter allocated by alloc_event_filter(). */
+	free(filter);
+	kvm_vm_free(vm);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c
index b1887242f3b8..7c57fb7e6422 100644
--- a/tools/testing/selftests/kvm/x86/svm_vmcall_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_vmcall_test.c
@@ -19,13 +19,10 @@ static void l2_guest_code(struct svm_test_data *svm)
static void l1_guest_code(struct svm_test_data *svm)
{
- #define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
/* Prepare for L2 execution. */
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
run_guest(vmcb, svm->vmcb_gpa);
diff --git a/tools/testing/selftests/kvm/x86/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c
index f1c488e0d497..0d83516f4bd0 100644
--- a/tools/testing/selftests/kvm/x86/triple_fault_event_test.c
+++ b/tools/testing/selftests/kvm/x86/triple_fault_event_test.c
@@ -21,9 +21,6 @@ static void l2_guest_code(void)
: : [port] "d" (ARBITRARY_IO_PORT) : "rax");
}
-#define L2_GUEST_STACK_SIZE 64
-unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
void l1_guest_code_vmx(struct vmx_pages *vmx)
{
@@ -31,8 +28,7 @@ void l1_guest_code_vmx(struct vmx_pages *vmx)
GUEST_ASSERT(prepare_for_vmx_operation(vmx));
GUEST_ASSERT(load_vmcs(vmx));
- prepare_vmcs(vmx, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx, l2_guest_code);
GUEST_ASSERT(!vmlaunch());
/* L2 should triple fault after a triple fault event injected. */
@@ -44,8 +40,7 @@ void l1_guest_code_svm(struct svm_test_data *svm)
{
struct vmcb *vmcb = svm->vmcb;
- generic_svm_setup(svm, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ generic_svm_setup(svm, l2_guest_code);
/* don't intercept shutdown to test the case of SVM allowing to do so */
vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
diff --git a/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c
index 1720113eae79..463f73aa9159 100644
--- a/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_apic_access_test.c
@@ -36,16 +36,13 @@ static void l2_guest_code(void)
static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u32 control;
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
/* Prepare the VMCS for L2 execution. */
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c
index 80a4fd1e5bbb..f9b88a6f6113 100644
--- a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c
@@ -31,15 +31,13 @@ static void l2_guest_code(void)
static void l1_guest_code(struct vmx_pages *vmx_pages)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u32 control;
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
/* Prepare the VMCS for L2 execution. */
- prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
control |= CPU_BASED_USE_MSR_BITMAPS;
vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
diff --git a/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c
index a2eaceed9ad5..6d88c54f69fa 100644
--- a/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c
+++ b/tools/testing/selftests/kvm/x86/vmx_invalid_nested_guest_state.c
@@ -25,15 +25,11 @@ static void l2_guest_code(void)
static void l1_guest_code(struct vmx_pages *vmx_pages)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
/* Prepare the VMCS for L2 execution. */
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
/*
* L2 must be run without unrestricted guest, verify that the selftests
diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
index f13dee317383..75073efa926d 100644
--- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
@@ -27,8 +27,6 @@ static void l2_guest_code(void)
static void l1_guest_code(struct vmx_pages *vmx_pages)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u64 guest_cr4;
gpa_t pml5_pa, pml4_pa;
u64 *pml5;
@@ -42,8 +40,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_ASSERT(load_vmcs(vmx_pages));
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
/*
* Set up L2 with a 4-level page table by pointing its CR3 to
diff --git a/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c
index 1b7b6ba23de7..eb8021c33cd4 100644
--- a/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_preemption_timer_test.c
@@ -66,8 +66,6 @@ void l2_guest_code(void)
void l1_guest_code(struct vmx_pages *vmx_pages)
{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
u64 l1_vmx_pt_start;
u64 l1_vmx_pt_finish;
u64 l1_tsc_deadline, l2_tsc_deadline;
@@ -77,8 +75,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
GUEST_ASSERT(load_vmcs(vmx_pages));
GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ prepare_vmcs(vmx_pages, l2_guest_code);
/*
* Check for Preemption timer support