diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 869 |
1 files changed, 474 insertions, 395 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbabb2f06273..b8124b562dea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -22,6 +22,7 @@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" @@ -81,7 +82,7 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -109,7 +110,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -struct kvm_x86_ops *kvm_x86_ops __read_mostly; +struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; @@ -180,7 +181,17 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) + +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -226,10 +237,25 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { }; u64 __read_mostly host_xcr0; +u64 __read_mostly supported_xcr0; +EXPORT_SYMBOL_GPL(supported_xcr0); struct kmem_cache *x86_fpu_cache; EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_emulator_cache; + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); +} + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -350,6 +376,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); @@ -438,6 +465,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * for #DB exceptions under VMX. */ vcpu->arch.dr6 ^= payload & DR6_RTM; + + /* + * The #DB payload is defined as compatible with the 'pending + * debug exceptions' field under VMX, not DR6. While bit 12 is + * defined in the 'pending debug exceptions' field (enabled + * breakpoint), it is reserved and must be zero in DR6. + */ + vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: vcpu->arch.cr2 = payload; @@ -490,19 +525,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; - /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). However, for ABI - * compatibility with KVM_GET_VCPU_EVENTS and - * KVM_SET_VCPU_EVENTS, we can't delay payload - * delivery unless userspace has enabled this - * functionality via the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD. - */ - if (!vcpu->kvm->arch.exception_payload_enabled || - !is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) kvm_deliver_exception_payload(vcpu); return; } @@ -623,7 +646,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -764,7 +787,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } else @@ -777,7 +800,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); @@ -873,7 +896,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops->get_cpl(vcpu) != 0 || + if (kvm_x86_ops.get_cpl(vcpu) != 0 || __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; @@ -907,10 +930,10 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & feature_bit(LA57)) + if (kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits &= ~X86_CR4_LA57; - if (kvm_x86_ops->umip_emulated()) + if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; @@ -954,7 +977,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops->set_cr4(vcpu, cr4)) + if (kvm_x86_ops.set_cr4(vcpu, cr4)) return 1; if (((cr4 ^ old_cr4) & pdptr_bits) || @@ -1038,7 +1061,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) static void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); + kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); } static void kvm_update_dr7(struct kvm_vcpu *vcpu) @@ -1049,7 +1072,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops->set_dr7(vcpu, dr7); + kvm_x86_ops.set_dr7(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1119,7 +1142,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) *val = vcpu->arch.dr6; else - *val = kvm_x86_ops->get_dr6(vcpu); + *val = kvm_x86_ops.get_dr6(vcpu); break; case 5: /* fall through */ @@ -1354,7 +1377,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops->get_msr_feature(msr)) + if (kvm_x86_ops.get_msr_feature(msr)) return 1; } return 0; @@ -1422,7 +1445,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops->set_efer(vcpu, efer); + kvm_x86_ops.set_efer(vcpu, efer); /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1478,7 +1501,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops->set_msr(vcpu, &msr); + return kvm_x86_ops.set_msr(vcpu, &msr); } /* @@ -1496,7 +1519,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops->get_msr(vcpu, &msr); + ret = kvm_x86_ops.get_msr(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1558,12 +1581,19 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); */ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) { - if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; } return 1; @@ -1572,11 +1602,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); + u64 data; int ret = 0; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; default: @@ -1635,7 +1666,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata->clock.cycle_last = tk->tkr_mono.cycle_last; vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; @@ -1643,7 +1674,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; vdata->clock.offset = tk->tkr_mono.base; - vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; @@ -1844,7 +1875,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) static inline int gtod_is_based_on_tsc(int mode) { - return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; + return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) @@ -1877,7 +1908,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1919,7 +1950,7 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } @@ -1927,7 +1958,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -1937,7 +1968,7 @@ static inline bool kvm_check_tsc_unstable(void) * TSC is marked unstable when we're running on Hyper-V, * 'TSC page' clocksource is good. */ - if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) return false; #endif return check_tsc_unstable(); @@ -2051,7 +2082,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -2092,30 +2123,30 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, u64 tsc_pg_val; switch (clock->vclock_mode) { - case VCLOCK_HVCLOCK: + case VDSO_CLOCKMODE_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ - *mode = VCLOCK_HVCLOCK; + *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & clock->mask; } else { /* TSC page invalid */ - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } break; - case VCLOCK_TSC: - *mode = VCLOCK_TSC; + case VDSO_CLOCKMODE_TSC: + *mode = VDSO_CLOCKMODE_TSC; *tsc_timestamp = read_tsc(); v = (*tsc_timestamp - clock->cycle_last) & clock->mask; break; default: - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } - if (*mode == VCLOCK_NONE) + if (*mode == VDSO_CLOCKMODE_NONE) *tsc_timestamp = v = 0; return v * clock->mult; @@ -2448,7 +2479,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - WARN_ON(vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -2527,7 +2557,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -2649,7 +2679,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -2802,12 +2832,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* - * We do support PT if kvm_x86_ops->pt_supported(), but we do - * not support IA32_XSS[bit 8]. Guests will have to use - * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT - * MSRs. + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data != 0) + if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; break; @@ -3081,7 +3110,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; @@ -3164,7 +3192,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -3369,10 +3396,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); + r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = !kvm_x86_ops.cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3399,14 +3426,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.get_nested_state ? + kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops->enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops->nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_enable_evmcs != NULL; break; default: break; @@ -3468,7 +3495,7 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) @@ -3500,9 +3527,9 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; - } default: r = -EINVAL; + break; } out: return r; @@ -3522,14 +3549,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) + if (kvm_x86_ops.has_wbinvd_exit()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_x86_ops.vcpu_load(vcpu, cpu); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3596,7 +3623,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); /* * Disable page faults because we're in atomic context here. @@ -3615,7 +3642,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_steal_time_set_preempted(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); - kvm_x86_ops->vcpu_put(vcpu); + kvm_x86_ops.vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -3629,7 +3656,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -3737,7 +3764,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops.setup_mce(vcpu); out: return r; } @@ -3796,6 +3823,21 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); /* + * In guest mode, payload delivery should be deferred, + * so that the L1 hypervisor can intercept #PF before + * CR2 is modified (or intercept #DB before DR6 is + * modified under nVMX). Unless the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of + * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we + * opportunistically defer the exception payload, deliver it if the + * capability hasn't been requested before processing a + * KVM_GET_VCPU_EVENTS. + */ + if (!vcpu->kvm->arch.exception_payload_enabled && + vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) + kvm_deliver_exception_payload(vcpu); + + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception @@ -3826,11 +3868,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -3897,13 +3939,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, + kvm_x86_ops.set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -4090,8 +4132,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) + if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { @@ -4178,9 +4219,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops->nested_enable_evmcs) + if (!kvm_x86_ops.nested_enable_evmcs) return -ENOTTY; - r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4189,10 +4230,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops->enable_direct_tlbflush) + if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops->enable_direct_tlbflush(vcpu); + return kvm_x86_ops.enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -4495,7 +4536,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops->get_nested_state) + if (!kvm_x86_ops.get_nested_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4503,7 +4544,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) break; @@ -4525,7 +4566,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops->set_nested_state) + if (!kvm_x86_ops.set_nested_state) break; r = -EFAULT; @@ -4547,7 +4588,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -4591,14 +4632,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); + ret = kvm_x86_ops.set_tss_addr(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); + return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4750,77 +4791,13 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; + if (kvm_x86_ops.flush_log_dirty) + kvm_x86_ops.flush_log_dirty(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -5173,8 +5150,8 @@ set_identity_unlock: } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops->mem_enc_op) - r = kvm_x86_ops->mem_enc_op(kvm, argp); + if (kvm_x86_ops.mem_enc_op) + r = kvm_x86_ops.mem_enc_op(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5185,8 +5162,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_reg_region) - r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_reg_region) + r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5197,8 +5174,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_unreg_region) - r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_unreg_region) + r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5249,28 +5226,28 @@ static void kvm_init_msr_list(void) continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: - if (!kvm_x86_ops->pt_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; @@ -5293,7 +5270,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) + if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5356,13 +5333,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->set_segment(vcpu, var, seg); + kvm_x86_ops.set_segment(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->get_segment(vcpu, var, seg); + kvm_x86_ops.get_segment(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -5382,14 +5359,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5397,7 +5374,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5446,7 +5423,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5471,7 +5448,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5492,7 +5469,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -5545,7 +5522,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -5608,7 +5585,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -5727,7 +5704,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@ -5736,10 +5713,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, * operation using rep will only have the initial GPA from the NPF * occurred. */ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5959,11 +5935,9 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int ret; if (vcpu->arch.pio.count) @@ -5983,20 +5957,33 @@ data_avail: return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); +} + +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + return kvm_x86_ops.get_segment_base(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6009,7 +5996,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops->has_wbinvd_exit()) { + if (kvm_x86_ops.has_wbinvd_exit()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -6114,27 +6101,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); + return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6256,13 +6243,15 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) @@ -6292,7 +6281,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); + kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6308,7 +6297,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); } static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) @@ -6370,7 +6359,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -6381,7 +6370,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + kvm_x86_ops.set_interrupt_shadow(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -6389,7 +6378,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) return kvm_propagate_fault(vcpu, &ctxt->exception); @@ -6401,13 +6390,31 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) return false; } +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@ -6427,7 +6434,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -6466,7 +6473,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -6483,10 +6490,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (!vcpu->arch.mmu->direct_map) { @@ -6574,10 +6582,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6645,10 +6654,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); int r; - r = kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops.skip_emulated_instruction(vcpu); if (unlikely(!r)) return 0; @@ -6740,7 +6749,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; @@ -6830,8 +6839,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. */ + if (vcpu->arch.mmu->direct_map) { + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. */ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); @@ -6872,7 +6892,7 @@ restart: r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || @@ -6880,6 +6900,8 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r && ctxt->tf) r = kvm_vcpu_do_singlestep(vcpu); + if (kvm_x86_ops.update_emulated_instruction) + kvm_x86_ops.update_emulated_instruction(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -6930,8 +6952,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_rax_read(vcpu); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + int ret = emulator_pio_out(vcpu, size, port, &val, 1); + if (ret) return ret; @@ -6967,11 +6989,10 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform * the copy and tracing */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); + emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -6986,8 +7007,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, - &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; @@ -7177,14 +7197,16 @@ static void kvm_timer_init(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; + struct cpufreq_policy *policy; int cpu; - memset(&policy, 0, sizeof(policy)); cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } put_cpu(); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, @@ -7208,7 +7230,7 @@ static int kvm_is_user_mode(void) int user_mode = 3; if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); + user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -7285,22 +7307,22 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - struct kvm_x86_ops *ops = opaque; - if (kvm_x86_ops) { + if (kvm_x86_ops.hardware_enable) { printk(KERN_ERR "kvm: already loaded the other module\n"); r = -EEXIST; goto out; } if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support\n"); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: disabled by bios\n"); r = -EOPNOTSUPP; goto out; } @@ -7325,18 +7347,22 @@ int kvm_arch_init(void *opaque) goto out; } + x86_emulator_cache = kvm_alloc_emulator_cache(); + if (!x86_emulator_cache) { + pr_err("kvm: failed to allocate cache for x86 emulator\n"); + goto out_free_x86_fpu_cache; + } + shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + goto out_free_x86_emulator_cache; } r = kvm_mmu_module_init(); if (r) goto out_free_percpu; - kvm_x86_ops = ops; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, PT_PRESENT_MASK, 0, sme_me_mask); @@ -7344,8 +7370,10 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + } kvm_lapic_init(); if (pi_inject_timer == -1) @@ -7361,6 +7389,8 @@ int kvm_arch_init(void *opaque) out_free_percpu: free_percpu(shared_msrs); +out_free_x86_emulator_cache: + kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: kmem_cache_destroy(x86_fpu_cache); out: @@ -7383,7 +7413,7 @@ void kvm_arch_exit(void) #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); #endif - kvm_x86_ops = NULL; + kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); free_percpu(shared_msrs); kmem_cache_destroy(x86_fpu_cache); @@ -7521,7 +7551,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops->get_cpl(vcpu) != 0) { + if (kvm_x86_ops.get_cpl(vcpu) != 0) { ret = -KVM_EPERM; goto out; } @@ -7567,7 +7597,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); + kvm_x86_ops.patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -7596,7 +7626,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; - if (!kvm_x86_ops->update_cr8_intercept) + if (!kvm_x86_ops.update_cr8_intercept) return; if (!lapic_in_kernel(vcpu)) @@ -7615,17 +7645,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) +static int inject_pending_event(struct kvm_vcpu *vcpu) { int r; /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) - kvm_x86_ops->queue_exception(vcpu); + kvm_x86_ops.queue_exception(vcpu); /* * Do not inject an NMI or interrupt if there is a pending * exception. Exceptions and interrupts are recognized at @@ -7642,9 +7672,9 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) - kvm_x86_ops->set_nmi(vcpu); + kvm_x86_ops.set_nmi(vcpu); else if (vcpu->arch.interrupt.injected) - kvm_x86_ops->set_irq(vcpu); + kvm_x86_ops.set_irq(vcpu); } /* @@ -7653,8 +7683,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * from L2 to L1 due to pending L1 events which require exit * from L2 to L1. */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { + r = kvm_x86_ops.check_nested_events(vcpu); if (r != 0) return r; } @@ -7691,7 +7721,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } } - kvm_x86_ops->queue_exception(vcpu); + kvm_x86_ops.queue_exception(vcpu); } /* Don't consider new event if we re-injected an event */ @@ -7699,14 +7729,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) return 0; if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops->smi_allowed(vcpu)) { + kvm_x86_ops.smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); + kvm_x86_ops.set_nmi(vcpu); } else if (kvm_cpu_has_injectable_intr(vcpu)) { /* * Because interrupts can be injected asynchronously, we are @@ -7715,15 +7745,15 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * proposal and current concerns. Perhaps we should be setting * KVM_REQ_EVENT only on certain events and not unconditionally? */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { + r = kvm_x86_ops.check_nested_events(vcpu); if (r != 0) return r; } - if (kvm_x86_ops->interrupt_allowed(vcpu)) { + if (kvm_x86_ops.interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - kvm_x86_ops->set_irq(vcpu); + kvm_x86_ops.set_irq(vcpu); } } @@ -7739,7 +7769,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -7829,11 +7859,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -7883,7 +7913,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -7893,7 +7923,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -7923,28 +7953,28 @@ static void enter_smm(struct kvm_vcpu *vcpu) * vCPU state (e.g. leave guest mode) after we've saved the state into * the SMM state-save area. */ - kvm_x86_ops->pre_enter_smm(vcpu, buf); + kvm_x86_ops.pre_enter_smm(vcpu, buf); vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops->get_nmi_mask(vcpu)) + if (kvm_x86_ops.get_nmi_mask(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops->set_nmi_mask(vcpu, true); + kvm_x86_ops.set_nmi_mask(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops->set_cr4(vcpu, 0); + kvm_x86_ops.set_cr4(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - kvm_x86_ops->set_idt(vcpu, &dt); + kvm_x86_ops.set_idt(vcpu, &dt); __kvm_set_dr(vcpu, 7, DR7_FIXED_1); @@ -7975,7 +8005,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - kvm_x86_ops->set_efer(vcpu, 0); + kvm_x86_ops.set_efer(vcpu, 0); #endif kvm_update_cpuid(vcpu); @@ -8013,7 +8043,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); - kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); + kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -8026,23 +8056,30 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - if (!kvm_x86_ops->check_apicv_inhibit_reasons || - !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) + unsigned long old, new, expected; + + if (!kvm_x86_ops.check_apicv_inhibit_reasons || + !kvm_x86_ops.check_apicv_inhibit_reasons(bit)) return; - if (activate) { - if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - !kvm_apicv_activated(kvm)) - return; - } else { - if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - kvm_apicv_activated(kvm)) - return; - } + old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); + do { + expected = new = old; + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + if (new == old) + break; + old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); + } while (old != expected); + + if (!!old == !!new) + return; trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops->pre_update_apicv_exec_ctrl) - kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate); + if (kvm_x86_ops.pre_update_apicv_exec_ctrl) + kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -8058,7 +8095,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -8078,7 +8115,7 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, vcpu_to_synic(vcpu)->vec_bitmap, 256); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); } int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -8105,13 +8142,13 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops->set_apic_access_page_addr) + if (!kvm_x86_ops.set_apic_access_page_addr) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) return; - kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); + kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page)); /* * Do not pin apic access page in memory, the MMU notifier @@ -8143,7 +8180,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) { r = 0; goto out; } @@ -8163,8 +8200,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) - kvm_mmu_load_cr3(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) + kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { @@ -8249,7 +8286,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu, req_int_win) != 0) + if (inject_pending_event(vcpu) != 0) req_immediate_exit = true; else { /* Enable SMI/NMI/IRQ window open exits if needed. @@ -8267,12 +8304,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * SMI. */ if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops->enable_smi_window(vcpu)) + if (!kvm_x86_ops.enable_smi_window(vcpu)) req_immediate_exit = true; if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); + kvm_x86_ops.enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + kvm_x86_ops.enable_irq_window(vcpu); WARN_ON(vcpu->arch.exception.pending); } @@ -8289,7 +8326,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_x86_ops.prepare_guest_switch(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -8320,7 +8357,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * notified with kvm_vcpu_kick. */ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { @@ -8335,7 +8372,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops->request_immediate_exit(vcpu); + kvm_x86_ops.request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); @@ -8355,7 +8392,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops->run(vcpu); + kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -8365,7 +8402,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_ops->sync_dirty_debug_regs(vcpu); + kvm_x86_ops.sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); @@ -8387,7 +8424,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); + kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath); /* * Consume any pending interrupts, including the possible source of @@ -8430,12 +8467,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); + r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath); return r; cancel_injection: - kvm_x86_ops->cancel_injection(vcpu); + kvm_x86_ops.cancel_injection(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: @@ -8445,13 +8481,13 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { + (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_x86_ops->post_block) - kvm_x86_ops->post_block(vcpu); + if (kvm_x86_ops.post_block) + kvm_x86_ops.post_block(vcpu); if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; @@ -8471,15 +8507,14 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) break; default: return -EINTR; - break; } return 1; } static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); + if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) + kvm_x86_ops.check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -8635,7 +8670,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ + /* PKRU is separately restored in kvm_x86_ops.run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8740,7 +8775,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_rax_read(vcpu); @@ -8838,10 +8873,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -8926,7 +8961,7 @@ out: int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -8942,7 +8977,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -8989,10 +9023,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); + kvm_x86_ops.set_idt(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); + kvm_x86_ops.set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; @@ -9002,16 +9036,16 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_x86_ops.set_efer(vcpu, sregs->efer); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + kvm_x86_ops.set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)); - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + kvm_x86_ops.set_cr4(vcpu, sregs->cr4); if (cpuid_update_needed) kvm_update_cpuid(vcpu); @@ -9117,7 +9151,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_bp_intercept(vcpu); + kvm_x86_ops.update_bp_intercept(vcpu); r = 0; @@ -9259,7 +9293,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -9297,11 +9330,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; + if (!alloc_emulate_ctxt(vcpu)) + goto free_wbinvd_dirty_mask; + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_wbinvd_dirty_mask; + goto free_emulate_ctxt; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -9326,7 +9362,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_hv_vcpu_init(vcpu); - r = kvm_x86_ops->vcpu_create(vcpu); + r = kvm_x86_ops.vcpu_create(vcpu); if (r) goto free_guest_fpu; @@ -9343,6 +9379,8 @@ free_guest_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_emulate_ctxt: + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: @@ -9377,11 +9415,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -9393,8 +9429,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); - kvm_x86_ops->vcpu_free(vcpu); + kvm_x86_ops.vcpu_free(vcpu); + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); @@ -9480,7 +9517,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.ia32_xss = 0; - kvm_x86_ops->vcpu_reset(vcpu, init_event); + kvm_x86_ops.vcpu_reset(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -9505,7 +9542,7 @@ int kvm_arch_hardware_enable(void) bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(); + ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; @@ -9587,18 +9624,29 @@ int kvm_arch_hardware_enable(void) void kvm_arch_hardware_disable(void) { - kvm_x86_ops->hardware_disable(); + kvm_x86_ops.hardware_disable(); drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - r = kvm_x86_ops->hardware_setup(); + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + r = ops->hardware_setup(); if (r != 0) return r; + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { @@ -9615,28 +9663,26 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_msr_list(); return 0; } void kvm_arch_hardware_unsetup(void) { - kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops.hardware_unsetup(); } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct kvm_x86_init_ops *ops = opaque; WARN_ON(!irqs_disabled()); if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) return -EIO; - return kvm_x86_ops->check_processor_compatibility(); + return ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -9662,9 +9708,16 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) pmu->need_cleanup = true; kvm_make_request(KVM_REQ_PMU, vcpu); } - kvm_x86_ops->sched_in(vcpu, cpu); + kvm_x86_ops.sched_in(vcpu, cpu); } +void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm->arch.hyperv.hv_pa_pg); + vfree(kvm); +} + + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { if (type) @@ -9699,7 +9752,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - return kvm_x86_ops->vm_init(kvm); + return kvm_x86_ops.vm_init(kvm); } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -9747,9 +9800,9 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + unsigned long hva, uninitialized_var(old_npages); struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *slot, old; + struct kvm_memory_slot *slot; /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) @@ -9757,7 +9810,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) slot = id_to_memslot(slots, id); if (size) { - if (slot->npages) + if (slot && slot->npages) return -EEXIST; /* @@ -9769,13 +9822,18 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) if (IS_ERR((void *)hva)) return PTR_ERR((void *)hva); } else { - if (!slot->npages) + if (!slot || !slot->npages) return 0; - hva = 0; + /* + * Stuff a non-canonical value to catch use-after-delete. This + * ends up being 0 on 32-bit KVM, but there's no better + * alternative. + */ + hva = (unsigned long)(0xdeadull << 48); + old_npages = slot->npages; } - old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; @@ -9790,7 +9848,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } if (!size) - vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + vm_munmap(hva, old_npages * PAGE_SIZE); return 0; } @@ -9817,8 +9875,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - if (kvm_x86_ops->vm_destroy) - kvm_x86_ops->vm_destroy(kvm); + if (kvm_x86_ops.vm_destroy) + kvm_x86_ops.vm_destroy(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -9829,34 +9887,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } + kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) continue; - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } - kvm_page_track_free_memslot(free, dont); + kvm_page_track_free_memslot(slot); } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, + unsigned long npages) { int i; + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. + */ + memset(&slot->arch, 0, sizeof(slot->arch)); + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; @@ -9887,11 +9947,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot + * other, disable large page support for this slot. */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { unsigned long j; for (j = 0; j < lpages; ++j) @@ -9938,6 +9996,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) + return kvm_alloc_memslot_metadata(memslot, + mem->memory_size >> PAGE_SHIFT); return 0; } @@ -9946,14 +10007,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); return; } /* * Call kvm_x86_ops dirty logging hooks when they are valid. * - * kvm_x86_ops->slot_disable_log_dirty is called when: + * kvm_x86_ops.slot_disable_log_dirty is called when: * * - KVM_MR_CREATE with dirty logging is disabled * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag @@ -9965,7 +10026,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * any additional overhead from PML when guest is running with dirty * logging disabled for memory slots. * - * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot + * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot * to dirty logging mode. * * If kvm_x86_ops dirty logging hooks are invalid, use write protect. @@ -9981,19 +10042,32 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * See the comments in fast_page_fault(). */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) - kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); + if (kvm_x86_ops.slot_enable_log_dirty) { + kvm_x86_ops.slot_enable_log_dirty(kvm, new); + } else { + int level = + kvm_dirty_log_manual_protect_and_init_set(kvm) ? + PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + + /* + * If we're with initial-all-set, we don't need + * to write protect any small page because + * they're reported as dirty already. However + * we still need to write-protect huge pages + * so that the page split can happen lazily on + * the first write to the huge page. + */ + kvm_mmu_slot_remove_write_access(kvm, new, level); + } } else { - if (kvm_x86_ops->slot_disable_log_dirty) - kvm_x86_ops->slot_disable_log_dirty(kvm, new); + if (kvm_x86_ops.slot_disable_log_dirty) + kvm_x86_ops.slot_disable_log_dirty(kvm, new); } } void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -10035,6 +10109,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if (change != KVM_MR_DELETE) kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + + /* Free the arrays associated with the old memslot. */ + if (change == KVM_MR_MOVE) + kvm_arch_free_memslot(kvm, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -10051,8 +10129,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops->guest_apic_has_interrupt && - kvm_x86_ops->guest_apic_has_interrupt(vcpu)); + kvm_x86_ops.guest_apic_has_interrupt && + kvm_x86_ops.guest_apic_has_interrupt(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -10071,7 +10149,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops->nmi_allowed(vcpu))) + kvm_x86_ops.nmi_allowed(vcpu))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || @@ -10104,7 +10182,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu)) return true; return false; @@ -10122,7 +10200,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -10144,7 +10222,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = kvm_x86_ops->get_rflags(vcpu); + rflags = kvm_x86_ops.get_rflags(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -10156,7 +10234,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_x86_ops.set_rflags(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -10179,10 +10257,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) + work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -10267,7 +10345,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) + kvm_x86_ops.get_cpl(vcpu) == 0)) return false; return true; @@ -10287,7 +10365,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -10416,7 +10494,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, irqfd->producer = prod; - return kvm_x86_ops->update_pi_irte(irqfd->kvm, + return kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 1); } @@ -10436,7 +10514,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); @@ -10445,7 +10523,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); + return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) @@ -10502,4 +10580,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); |