diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-04-02 12:09:02 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-04-02 12:09:02 -0700 |
commit | 38904911e86495d4690f8d805720b90e65426c71 (patch) | |
tree | 6a162530ad117c636fc1e144ee223099b85eefd4 /arch | |
parent | 6f34f8c3d6178527d4c02aa3a53c370cc70cb91e (diff) | |
parent | c15e0ae42c8e5a61e9aca8aac920517cf7b3e94e (diff) | |
download | lwn-38904911e86495d4690f8d805720b90e65426c71.tar.gz lwn-38904911e86495d4690f8d805720b90e65426c71.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
- Only do MSR filtering for MSRs accessed by rdmsr/wrmsr
- Documentation improvements
- Prevent module exit until all VMs are freed
- PMU Virtualization fixes
- Fix for kvm_irq_delivery_to_apic_fast() NULL-pointer dereferences
- Other miscellaneous bugfixes
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (42 commits)
KVM: x86: fix sending PV IPI
KVM: x86/mmu: do compare-and-exchange of gPTE via the user address
KVM: x86: Remove redundant vm_entry_controls_clearbit() call
KVM: x86: cleanup enter_rmode()
KVM: x86: SVM: fix tsc scaling when the host doesn't support it
kvm: x86: SVM: remove unused defines
KVM: x86: SVM: move tsc ratio definitions to svm.h
KVM: x86: SVM: fix avic spec based definitions again
KVM: MIPS: remove reference to trap&emulate virtualization
KVM: x86: document limitations of MSR filtering
KVM: x86: Only do MSR filtering when access MSR by rdmsr/wrmsr
KVM: x86/emulator: Emulate RDPID only if it is enabled in guest
KVM: x86/pmu: Fix and isolate TSX-specific performance event logic
KVM: x86: mmu: trace kvm_mmu_set_spte after the new SPTE was set
KVM: x86/svm: Clear reserved bits written to PerfEvtSeln MSRs
KVM: x86: Trace all APICv inhibit changes and capture overall status
KVM: x86: Add wrappers for setting/clearing APICv inhibits
KVM: x86: Make APICv inhibit reasons an enum and cleanup naming
KVM: X86: Handle implicit supervisor access with SMAP
KVM: X86: Rename variable smap to not_smap in permission_fault()
...
Diffstat (limited to 'arch')
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 46 | ||||
-rw-r--r-- | arch/x86/include/asm/svm.h | 14 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 22 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_emulate.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 32 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 82 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 14 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 36 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_onhyperv.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 22 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 26 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 161 | ||||
-rw-r--r-- | arch/x86/kvm/xen.c | 7 |
26 files changed, 350 insertions, 306 deletions
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ab73d99483fe..156d1c25a3c1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3462,7 +3462,7 @@ void exit_sie(struct kvm_vcpu *vcpu) /* Kick a guest cpu out of SIE to process a request synchronously */ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - kvm_make_request(req, vcpu); + __kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4138939532c6..d23e80a56eb8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -249,6 +249,7 @@ enum x86_intercept_stage; #define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 +#define PFERR_IMPLICIT_ACCESS_BIT 48 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -259,6 +260,7 @@ enum x86_intercept_stage; #define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ @@ -430,7 +432,7 @@ struct kvm_mmu { void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t gva_or_gpa, u32 access, + gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); @@ -512,6 +514,7 @@ struct kvm_pmu { u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; u64 reserved_bits; + u64 raw_event_mask; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; @@ -1040,14 +1043,16 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; -#define APICV_INHIBIT_REASON_DISABLE 0 -#define APICV_INHIBIT_REASON_HYPERV 1 -#define APICV_INHIBIT_REASON_NESTED 2 -#define APICV_INHIBIT_REASON_IRQWIN 3 -#define APICV_INHIBIT_REASON_PIT_REINJ 4 -#define APICV_INHIBIT_REASON_X2APIC 5 -#define APICV_INHIBIT_REASON_BLOCKIRQ 6 -#define APICV_INHIBIT_REASON_ABSENT 7 +enum kvm_apicv_inhibit { + APICV_INHIBIT_REASON_DISABLE, + APICV_INHIBIT_REASON_HYPERV, + APICV_INHIBIT_REASON_NESTED, + APICV_INHIBIT_REASON_IRQWIN, + APICV_INHIBIT_REASON_PIT_REINJ, + APICV_INHIBIT_REASON_X2APIC, + APICV_INHIBIT_REASON_BLOCKIRQ, + APICV_INHIBIT_REASON_ABSENT, +}; struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1401,7 +1406,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*check_apicv_inhibit_reasons)(ulong bit); + bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1585,7 +1590,7 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -void kvm_mmu_init_vm(struct kvm *kvm); +int kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); @@ -1795,11 +1800,22 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); -void kvm_request_apicv_update(struct kvm *kvm, bool activate, - unsigned long bit); +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set); + +static inline void kvm_set_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, true); +} -void __kvm_request_apicv_update(struct kvm *kvm, bool activate, - unsigned long bit); +static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason) +{ + kvm_set_or_clear_apicv_inhibit(kvm, reason, false); +} int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 7eb2df5417fb..f70a5108d464 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -221,8 +221,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) +#define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL +#define SVM_TSC_RATIO_MIN 0x0000000000000001ULL +#define SVM_TSC_RATIO_MAX 0x000000ffffffffffULL +#define SVM_TSC_RATIO_DEFAULT 0x0100000000ULL + + /* AVIC */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFFULL) #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) @@ -230,9 +236,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) -#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF) +#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL) + +#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 79e0b8d63ffa..a22deb58f86d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -517,7 +517,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a00cd97b2623..b24ca7f4ed7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -735,6 +735,7 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, if (function > READ_ONCE(max_cpuid_80000000)) return entry; } + break; default: break; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f9c00c89829b..89b11e7dca8a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3540,8 +3540,10 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) { u64 tsc_aux = 0; - if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + if (!ctxt->ops->guest_has_rdpid(ctxt)) return emulate_ud(ctxt); + + ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } @@ -3642,7 +3644,7 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data); if (r == X86EMUL_IO_NEEDED) return r; @@ -3659,7 +3661,7 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) u64 msr_data; int r; - r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data); if (r == X86EMUL_IO_NEEDED) return r; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a32f54ab84a2..123b677111c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -122,9 +122,13 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else hv->synic_auto_eoi_used--; - __kvm_request_apicv_update(vcpu->kvm, - !hv->synic_auto_eoi_used, - APICV_INHIBIT_REASON_HYPERV); + /* + * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on + * the hypervisor to manually inject IRQs. + */ + __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, + APICV_INHIBIT_REASON_HYPERV, + !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } @@ -239,7 +243,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; - if (!synic->active && !host) + if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -285,6 +289,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, case HV_X64_MSR_EOM: { int i; + if (!synic->active) + break; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; @@ -449,6 +456,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) struct kvm_lapic_irq irq; int ret, vector; + if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return -EINVAL; + if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; @@ -661,7 +671,7 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); - if (!synic->active && !host) + if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && @@ -690,7 +700,7 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); - if (!synic->active && !host) + if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0b65a764ed3a..1c83076091af 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -305,15 +305,13 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) * So, deactivate APICv when PIT is in reinject mode. */ if (reinject) { - kvm_request_apicv_update(kvm, false, - APICV_INHIBIT_REASON_PIT_REINJ); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); /* The initial state is preserved while ps->reinject == 0. */ kvm_pit_reset_reinject(pit); kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } else { - kvm_request_apicv_update(kvm, true, - APICV_INHIBIT_REASON_PIT_REINJ); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 840ddb4a9302..8dff25d267b7 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -210,6 +210,8 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); @@ -226,6 +228,7 @@ struct x86_emulate_ops { bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 80a2020c4db4..66b0eb0bda94 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1024,6 +1024,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { + if (KVM_BUG_ON(!src, kvm)) { + *r = 0; + return true; + } *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bf8dbc4bb12a..e6cae6f22683 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -214,27 +214,27 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, - unsigned pfec) + u64 access) { - int cpl = static_call(kvm_x86_get_cpl)(vcpu); + /* strip nested paging fault error codes */ + unsigned int pfec = access; unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); /* - * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. + * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. + * For implicit supervisor accesses, SMAP cannot be overridden. * - * If CPL = 3, SMAP applies to all supervisor-mode data accesses - * (these are implicit supervisor accesses) regardless of the value - * of EFLAGS.AC. + * SMAP works on supervisor accesses only, and not_smap can + * be set or not set when user access with neither has any bearing + * on the result. * - * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving - * the result in X86_EFLAGS_AC. We then insert it in place of - * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec, - * but it will be one in index if SMAP checks are being overridden. - * It is important to keep this branchless. + * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit; + * this bit will always be zero in pfec, but it will be one in index + * if SMAP checks are being disabled. */ - unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); - int index = (pfec >> 1) + - (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; + bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; + int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; @@ -317,12 +317,12 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) atomic64_add(count, &kvm->stat.pages[level - 1]); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception); static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t gpa, u32 access, + gpa_t gpa, u64 access, struct x86_exception *exception) { if (mmu != &vcpu->arch.nested_mmu) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 51671cb34fb6..8f19ea752704 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2696,8 +2696,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (*sptep == spte) { ret = RET_PF_SPURIOUS; } else { - trace_kvm_mmu_set_spte(level, gfn, sptep); flush |= mmu_spte_update(sptep, spte); + trace_kvm_mmu_set_spte(level, gfn, sptep); } if (wrprot) { @@ -3703,7 +3703,7 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t vaddr, u32 access, + gpa_t vaddr, u64 access, struct x86_exception *exception) { if (exception) @@ -4591,11 +4591,11 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch - * - Page fault in kernel mode - * - if CPL = 3 or X86_EFLAGS_AC is clear + * - The access is supervisor mode + * - If implicit supervisor access or X86_EFLAGS_AC is clear * - * Here, we cover the first three conditions. - * The fourth is computed dynamically in permission_fault(); + * Here, we cover the first four conditions. + * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions. */ @@ -5768,17 +5768,24 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, kvm_mmu_zap_all_fast(kvm); } -void kvm_mmu_init_vm(struct kvm *kvm) +int kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + int r; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - kvm_mmu_init_tdp_mmu(kvm); + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + return 0; } void kvm_mmu_uninit_vm(struct kvm *kvm) @@ -5842,8 +5849,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush); + flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, + gfn_end, true, flush); } if (flush) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 252c77805eb9..01fee5f67ac3 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -34,9 +34,8 @@ #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgq" #else - #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 2 #endif #elif PTTYPE == 32 @@ -52,7 +51,7 @@ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgl" #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -65,7 +64,9 @@ #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) - #define CMPXCHG cmpxchg64 + #ifdef CONFIG_X86_64 + #define CMPXCHG "cmpxchgq" + #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value @@ -147,43 +148,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) { - int npages; - pt_element_t ret; - pt_element_t *table; - struct page *page; - - npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); - if (likely(npages == 1)) { - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); - } else { - struct vm_area_struct *vma; - unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; - unsigned long pfn; - unsigned long paddr; - - mmap_read_lock(current->mm); - vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); - if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - paddr = pfn << PAGE_SHIFT; - table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); - if (!table) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - ret = CMPXCHG(&table[index], orig_pte, new_pte); - memunmap(table); - mmap_read_unlock(current->mm); - } + signed char r; - return (ret != orig_pte); + if (!user_access_begin(ptep_user, sizeof(pt_element_t))) + return -EFAULT; + +#ifdef CMPXCHG + asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n" + "setnz %b[r]\n" + "2:" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) + : [ptr] "+m" (*ptep_user), + [old] "+a" (orig_pte), + [r] "=q" (r) + : [new] "r" (new_pte) + : "memory"); +#else + asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n" + "setnz %b[r]\n" + "2:" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r]) + : [ptr] "+m" (*ptep_user), + [old] "+A" (orig_pte), + [r] "=q" (r) + : [new_lo] "b" ((u32)new_pte), + [new_hi] "c" ((u32)(new_pte >> 32)) + : "memory"); +#endif + + user_access_end(); + return r; } static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, @@ -339,7 +333,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t addr, u32 access) + gpa_t addr, u64 access) { int ret; pt_element_t pte; @@ -347,7 +341,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; - unsigned nested_access; + u64 nested_access; gpa_t pte_gpa; bool have_ad; int offset; @@ -540,7 +534,7 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gpa_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u64 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); @@ -988,7 +982,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gpa_t addr, u32 access, + gpa_t addr, u64 access, struct x86_exception *exception) { struct guest_walker walker; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e7e7876251b3..d71d177ae6b8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,21 +14,24 @@ static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) +int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { + struct workqueue_struct *wq; + if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return false; + return 0; + + wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); + if (!wq) + return -ENOMEM; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); - kvm->arch.tdp_mmu_zap_wq = - alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - - return true; + kvm->arch.tdp_mmu_zap_wq = wq; + return 1; } /* Arbitrarily returns true so that this may be used in if statements. */ @@ -906,10 +909,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs + * have been cleared and a TLB flush is needed before releasing the MMU lock. * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this @@ -917,42 +918,25 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) * the caller must ensure it does not supply too large a GFN range, or the * operation can cause a soft lockup. */ -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush) +static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush) { - bool zap_all = (start == 0 && end >= tdp_mmu_max_gfn_host()); struct tdp_iter iter; - /* - * No need to try to step down in the iterator when zapping all SPTEs, - * zapping the top-level non-leaf SPTEs will recurse on their children. - */ - int min_level = zap_all ? root->role.level : PG_LEVEL_4K; - end = min(end, tdp_mmu_max_gfn_host()); lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, min_level, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; continue; } - if (!is_shadow_present_pte(iter.old_spte)) - continue; - - /* - * If this is a non-last-level SPTE that covers a larger range - * than should be zapped, continue, and zap the mappings at a - * lower level, except when zapping all SPTEs. - */ - if (!zap_all && - (iter.gfn < start || - iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -960,17 +944,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, flush = true; } - /* - * Need to flush before releasing RCU. TODO: do it only if intermediate - * page tables were zapped; there is no need to flush under RCU protection - * if no 'struct kvm_mmu_page' is freed. - */ - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, start, end - start); - rcu_read_unlock(); - return false; + /* + * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need + * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. + */ + return flush; } /* @@ -979,13 +959,13 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush) +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, + bool can_yield, bool flush) { struct kvm_mmu_page *root; for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); return flush; } @@ -1233,8 +1213,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start, - range->end, range->may_block, flush); + return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, + range->end, range->may_block, flush); } typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 5e5ef2576c81..c163f7cc23ca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -15,14 +15,8 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush); -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush) -{ - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); -} - bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); @@ -72,7 +66,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } @@ -93,7 +87,7 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else -static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b1a02993782b..eca39f56c231 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -96,8 +96,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, - bool exclude_kernel, bool intr, - bool in_tx, bool in_tx_cp) + bool exclude_kernel, bool intr) { struct perf_event *event; struct perf_event_attr attr = { @@ -116,16 +115,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, attr.sample_period = get_sample_period(pmc, pmc->counter); - if (in_tx) - attr.config |= HSW_IN_TX; - if (in_tx_cp) { + if ((attr.config & HSW_IN_TX_CHECKPOINTED) && + guest_cpuid_is_intel(pmc->vcpu)) { /* * HSW_IN_TX_CHECKPOINTED is not supported with nonzero * period. Just clear the sample period so at least * allocating the counter doesn't fail. */ attr.sample_period = 0; - attr.config |= HSW_IN_TX_CHECKPOINTED; } event = perf_event_create_kernel_counter(&attr, -1, current, @@ -185,6 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; + struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -221,7 +219,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type == PERF_TYPE_RAW) - config = eventsel & AMD64_RAW_EVENT_MASK; + config = eventsel & pmu->raw_event_mask; if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) return; @@ -232,9 +230,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT, - (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_gp_counter); @@ -270,7 +266,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ - pmi, false, false); + pmi); } EXPORT_SYMBOL_GPL(reprogram_fixed_counter); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b37b353ec086..a1cf9c31273b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -726,7 +726,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP)) @@ -737,7 +737,13 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { struct vcpu_data vcpu_info; @@ -822,7 +828,7 @@ out: return ret; } -bool avic_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | @@ -833,7 +839,7 @@ bool avic_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); - return supported & BIT(bit); + return supported & BIT(reason); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index d4de52409335..24eb935b6f85 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -262,12 +262,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { + data &= ~pmu->reserved_bits; + if (data != pmc->eventsel) reprogram_gp_counter(pmc, data); - return 0; - } + return 0; } return 1; @@ -284,6 +282,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; + pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0884c3414a1b..bd4c64b362d2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -62,20 +62,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -87,7 +75,6 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ @@ -480,7 +467,7 @@ static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (tsc_scaling) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -526,8 +513,8 @@ static int svm_hardware_enable(void) * Set the default value, even if we don't use TSC scaling * to avoid having stale value in the msr */ - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); + wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); + __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT); } @@ -2723,7 +2710,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; } - if (data & TSC_RATIO_RSVD) + if (data & SVM_TSC_RATIO_RSVD) return 1; svm->tsc_ratio_msr = data; @@ -2918,7 +2905,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); + kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3516,7 +3503,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3948,6 +3935,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; + struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -3974,16 +3962,14 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * is exposed to the guest, disable AVIC. */ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_X2APIC); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); /* * Currently, AVIC does not work with nested virtualization. * So, we disable AVIC when cpuid for SVM is set in the L1 guest. */ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_NESTED); + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED); } init_vmcb_after_set_cpuid(vcpu); } @@ -4766,10 +4752,10 @@ static __init int svm_hardware_setup(void) } else { pr_info("TSC scaling supported\n"); kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; } } + kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e37bb3508cfa..f77a7d2d39dd 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,6 +22,8 @@ #include <asm/svm.h> #include <asm/sev-common.h> +#include "kvm_cache_regs.h" + #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) #define IOPM_SIZE PAGE_SIZE * 3 @@ -569,17 +571,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -592,7 +583,7 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool avic_check_apicv_inhibit_reasons(ulong bit); +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 98aa981c04ec..8cdc62c74a96 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -4,7 +4,6 @@ */ #include <linux/kvm_host.h> -#include "kvm_cache_regs.h" #include <asm/mshyperv.h> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 193f5ba930d1..e3a24b8f04be 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1339,23 +1339,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); -TRACE_EVENT(kvm_apicv_update_request, - TP_PROTO(bool activate, unsigned long bit), - TP_ARGS(activate, bit), +TRACE_EVENT(kvm_apicv_inhibit_changed, + TP_PROTO(int reason, bool set, unsigned long inhibits), + TP_ARGS(reason, set, inhibits), TP_STRUCT__entry( - __field(bool, activate) - __field(unsigned long, bit) + __field(int, reason) + __field(bool, set) + __field(unsigned long, inhibits) ), TP_fast_assign( - __entry->activate = activate; - __entry->bit = bit; + __entry->reason = reason; + __entry->set = set; + __entry->inhibits = inhibits; ), - TP_printk("%s bit=%lu", - __entry->activate ? "activate" : "deactivate", - __entry->bit) + TP_printk("%s reason=%u, inhibits=0x%lx", + __entry->set ? "set" : "cleared", + __entry->reason, __entry->inhibits) ); TRACE_EVENT(kvm_apicv_accept_irq, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0684e519181b..bc3f8512bb64 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -389,6 +389,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct kvm_pmc *pmc; u32 msr = msr_info->index; u64 data = msr_info->data; + u64 reserved_bits; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -443,7 +444,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & pmu->reserved_bits)) { + reserved_bits = pmu->reserved_bits; + if ((pmc->idx == 2) && + (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) + reserved_bits ^= HSW_IN_TX_CHECKPOINTED; + if (!(data & reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -485,6 +490,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry || !vcpu->kvm->arch.enable_pmu) @@ -533,8 +539,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) entry = kvm_find_cpuid_entry(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + } bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e8963f5af618..04d170c4b61e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2866,21 +2866,17 @@ static void enter_rmode(struct kvm_vcpu *vcpu) int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); /* Nothing to do if hardware doesn't support EFER. */ - if (!msr) + if (!vmx_find_uret_msr(vmx, MSR_EFER)) return 0; vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); + if (efer & EFER_LMA) + vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); + else + vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); - msr->data = efer & ~EFER_LME; - } vmx_setup_uret_msrs(vmx); return 0; } @@ -2906,7 +2902,6 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -7705,14 +7700,14 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(ulong bit) +static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); - return supported & BIT(bit); + return supported & BIT(reason); } static struct kvm_x86_ops vmx_x86_ops __initdata = { @@ -7980,12 +7975,11 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - if (cpu_has_vmx_tsc_scaling()) { + if (cpu_has_vmx_tsc_scaling()) kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02cf0a7e1d14..0c0ca599a353 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1748,9 +1748,6 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1832,9 +1829,6 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) @@ -1871,6 +1865,20 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, return ret; } +static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + return kvm_get_msr_ignored_check(vcpu, index, data, false); +} + +static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + return kvm_set_msr_ignored_check(vcpu, index, data, false); +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, false); @@ -1953,7 +1961,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) u64 data; int r; - r = kvm_get_msr(vcpu, ecx, &data); + r = kvm_get_msr_with_filter(vcpu, ecx, &data); if (!r) { trace_kvm_msr_read(ecx, data); @@ -1978,7 +1986,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) u64 data = kvm_read_edx_eax(vcpu); int r; - r = kvm_set_msr(vcpu, ecx, data); + r = kvm_set_msr_with_filter(vcpu, ecx, data); if (!r) { trace_kvm_msr_write(ecx, data); @@ -5938,7 +5946,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; - kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -6335,7 +6343,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; - kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -6726,7 +6734,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, static_call(kvm_x86_get_segment)(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -6746,7 +6754,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); @@ -6756,7 +6764,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -6766,7 +6774,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -6782,7 +6790,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -6819,7 +6827,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -6844,7 +6852,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -6863,9 +6871,11 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception, bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = 0; + u64 access = 0; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -6881,7 +6891,7 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, } static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -6915,9 +6925,11 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = PFERR_WRITE_MASK; + u64 access = PFERR_WRITE_MASK; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -6984,7 +6996,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, bool write) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) + u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -7627,13 +7639,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, return; } -static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 *pdata) +static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr(vcpu, msr_index, pdata); + r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, complete_emulated_rdmsr, r)) { @@ -7644,13 +7656,13 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return r; } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) +static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr(vcpu, msr_index, data); + r = kvm_set_msr_with_filter(vcpu, msr_index, data); if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, complete_emulated_msr_access, r)) { @@ -7661,6 +7673,18 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return r; } +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) +{ + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} + +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); +} + static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -7724,6 +7748,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); @@ -7794,6 +7823,8 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .get_smbase = emulator_get_smbase, .set_smbase = emulator_set_smbase, + .set_msr_with_filter = emulator_set_msr_with_filter, + .get_msr_with_filter = emulator_get_msr_with_filter, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -7806,6 +7837,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .exiting_smm = emulator_exiting_smm, @@ -9058,15 +9090,29 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); + +static void set_or_clear_apicv_inhibit(unsigned long *inhibits, + enum kvm_apicv_inhibit reason, bool set) +{ + if (set) + __set_bit(reason, inhibits); + else + __clear_bit(reason, inhibits); + + trace_kvm_apicv_inhibit_changed(reason, set, *inhibits); +} + static void kvm_apicv_init(struct kvm *kvm) { + unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; + init_rwsem(&kvm->arch.apicv_update_lock); - set_bit(APICV_INHIBIT_REASON_ABSENT, - &kvm->arch.apicv_inhibit_reasons); + set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true); + if (!enable_apicv) - set_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + set_or_clear_apicv_inhibit(inhibits, + APICV_INHIBIT_REASON_ABSENT, true); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) @@ -9740,24 +9786,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) { unsigned long old, new; lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) + if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason)) return; old = new = kvm->arch.apicv_inhibit_reasons; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); + set_or_clear_apicv_inhibit(&new, reason, set); if (!!old != !!new) { - trace_kvm_apicv_update_request(activate, bit); /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid * false positives in the sanity check WARN in svm_vcpu_run(). @@ -9776,20 +9819,22 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); kvm_zap_gfn_range(kvm, gfn, gfn+1); } - } else + } else { kvm->arch.apicv_inhibit_reasons = new; + } } -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) { if (!enable_apicv) return; down_write(&kvm->arch.apicv_update_lock); - __kvm_request_apicv_update(kvm, activate, bit); + __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); up_write(&kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_request_apicv_update); +EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -10937,7 +10982,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) { - bool inhibit = false; + bool set = false; struct kvm_vcpu *vcpu; unsigned long i; @@ -10945,11 +10990,11 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { - inhibit = true; + set = true; break; } } - __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set); up_write(&kvm->arch.apicv_update_lock); } @@ -11557,10 +11602,8 @@ int kvm_arch_hardware_setup(void *opaque) u64 max = min(0x7fffffffULL, __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } @@ -11629,12 +11672,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) ret = kvm_page_track_init(kvm); if (ret) - return ret; + goto out; + + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_page_track; INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -11666,10 +11710,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_mmu_init_vm(kvm); kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); + +out_page_track: + kvm_page_track_cleanup(kvm); +out: + return ret; } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -12593,7 +12641,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; struct x86_exception fault; - u32 access = error_code & + u64 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || @@ -12933,7 +12981,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4aa0f2b31665..bf6cc25eee76 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -39,8 +39,8 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, - gpa, PAGE_SIZE, false); + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, + gpa, PAGE_SIZE); if (ret) goto out; @@ -1025,8 +1025,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, - PAGE_SIZE, false); + rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); |