diff options
Diffstat (limited to 'arch/x86/kvm')
34 files changed, 1350 insertions, 821 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8ea8c097de 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2cbb3874ad39..571c906ffcbf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -58,25 +58,24 @@ void __init kvm_init_xstate_sizes(void) u32 xstate_required_size(u64 xstate_bv, bool compacted) { - int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + int i; xstate_bv &= XFEATURE_MASK_EXTEND; - while (xstate_bv) { - if (xstate_bv & 0x1) { - struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit]; - u32 offset; - - /* ECX[1]: 64B alignment in compacted form */ - if (compacted) - offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; - else - offset = xs->ebx; - ret = max(ret, offset + xs->eax); - } + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + u32 offset; - xstate_bv >>= 1; - feature_bit++; + if (!(xstate_bv & BIT_ULL(i))) + continue; + + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = xs->ebx; + ret = max(ret, offset + xs->eax); + xstate_bv &= ~BIT_ULL(i); } return ret; @@ -196,6 +195,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) } static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu); +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, @@ -300,10 +300,12 @@ static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, guest_cpu_cap_change(vcpu, x86_feature, has_feature); } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + vcpu->arch.cpuid_dynamic_bits_dirty = false; + best = kvm_find_cpuid_entry(vcpu, 1); if (best) { kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE, @@ -333,7 +335,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); } -EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { @@ -646,6 +647,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; @@ -1180,7 +1184,7 @@ void kvm_set_cpu_caps(void) SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), - SYNTHESIZED_F(SRSO_USER_KERNEL_NO), + F(SRSO_USER_KERNEL_NO), ); kvm_cpu_cap_init(CPUID_8000_0022_EAX, @@ -1423,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1440,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1704,7 +1706,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) phys_as = entry->eax & 0xff; g_phys_as = phys_as; if (kvm_mmu_get_max_tdp_level() < 5) - g_phys_as = min(g_phys_as, 48); + g_phys_as = min(g_phys_as, 48U); } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); @@ -1759,23 +1761,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; } cpuid_entry_override(entry, CPUID_8000_0022_EAX); - if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) - ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; - else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; - else - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; - + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; entry->ebx = ebx.full; break; } @@ -1985,6 +1981,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; @@ -2000,12 +1999,29 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, *edx = entry->edx; if (function == 7 && index == 0) { u64 data; - if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && + !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { if (kvm_hv_invtsc_suppressed(vcpu)) *edx &= ~feature_bit(CONSTANT_TSC); + } else if (IS_ENABLED(CONFIG_KVM_XEN) && + kvm_xen_is_tsc_leaf(vcpu, function)) { + /* + * Update guest TSC frequency information if necessary. + * Ignore failures, there is no sane value that can be + * provided if KVM can't get the TSC frequency. + */ + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) + kvm_guest_time_update(vcpu); + + if (index == 1) { + *ecx = vcpu->arch.pvclock_tsc_mul; + *edx = vcpu->arch.pvclock_tsc_shift; + } else if (index == 2) { + *eax = vcpu->arch.hw_tsc_khz; + } } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 67d80aa72d50..d2884162a46a 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,7 +11,6 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -232,6 +231,14 @@ static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu, { unsigned int x86_leaf = __feature_leaf(x86_feature); + /* + * Except for MWAIT, querying dynamic feature bits is disallowed, so + * that KVM can defer runtime updates until the next CPUID emulation. + */ + BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC || + x86_feature == X86_FEATURE_OSXSAVE || + x86_feature == X86_FEATURE_OSPKE); + return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature); } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 60986f67c35a..1349e278cd2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -477,8 +477,11 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, + .src_type = ctxt->src.type, + .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, - .next_rip = ctxt->eip, + .rip = ctxt->eip, + .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f22..24f0318c50d7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } @@ -2226,6 +2225,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2854,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d7ab8780ab9e..739aa6c0d0c3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; + hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8dec646e764b..a8fb19940975 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level) { struct kvm_pic *s = kvm->arch.vpic; - if (!s->output) + if (!s->output && level) s->wakeup_needed = true; s->output = level; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 73072585e164..c1df5acfacaf 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -44,7 +44,10 @@ struct x86_instruction_info { u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ + u8 src_type; /* type of source operand */ + u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ + u64 rip; /* rip of the instruction */ u64 next_rip; /* rip following the instruction */ }; @@ -272,8 +275,10 @@ struct operand { }; }; +#define X86_MAX_INSTRUCTION_LENGTH 15 + struct fetch_cache { - u8 data[15]; + u8 data[X86_MAX_INSTRUCTION_LENGTH]; u8 *ptr; u8 *end; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a009c94c26c2..28e3317124fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -221,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) @@ -489,7 +482,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree_rcu(old, rcu); kvm_make_scan_ioapic_request(kvm); } @@ -2593,7 +2586,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; if (!apic) return; @@ -2921,9 +2914,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - apic->lapic_timer.timer.function = apic_timer_fn; + hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; @@ -3397,9 +3389,9 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -3408,7 +3400,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) sipi_vector = apic->sipi_vector; kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } } return 0; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a45ae60e84ab..63bb77ee1bb1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); @@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) + !spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); @@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. + */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} + +/* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & KVM_RMAP_MANY)) { + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->sptes[0] = (u64 *)rmap_head->val; + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void pte_list_desc_remove_entry(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) - rmap_head->val = 0; + *rmap_val = 0; else - rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; + unsigned long rmap_val; int i; - if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) - return; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) - return; + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; - rmap_head->val = 0; + rmap_val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(kvm, rmap_head, + pte_list_desc_remove_entry(kvm, &rmap_val, desc, i); - return; + goto out; } } desc = desc->more; @@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } static void kvm_zap_one_rmap_spte(struct kvm *kvm, @@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & KVM_RMAP_MANY)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1067,23 +1221,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & KVM_RMAP_MANY)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1093,14 +1243,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } @@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm, kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(cache, spte, rmap_head); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; @@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, bool test_only) + struct kvm_gfn_range *range, + bool test_only) { - struct slot_rmap_walk_iterator iterator; + struct kvm_rmap_head *rmap_head; struct rmap_iterator iter; + unsigned long rmap_val; bool young = false; u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) { - for_each_rmap_spte(iterator.rmap, &iter, sptep) { - u64 spte = *sptep; + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); - if (!is_accessed_spte(spte)) - continue; + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; - if (test_only) - return true; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * WARN if mmu_spte_update() signals the need - * for a TLB flush, as Access tracking a SPTE - * should never trigger an _immediate_ flush. - */ - spte = mark_spte_for_access_track(spte); - WARN_ON_ONCE(mmu_spte_update(sptep, spte)); + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } + + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. + */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; } - young = true; + + kvm_rmap_unlock_readonly(rmap_head, rmap_val); } } return young; } +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, false); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + young = kvm_tdp_mmu_age_gfn_range(kvm, range); + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, true); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + young = kvm_tdp_mmu_test_age_gfn(kvm, range); + + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } @@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(cache, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For @@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } @@ -5540,7 +5713,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -7120,6 +7293,19 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) +{ + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. + */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); + + if (nx_thread) + vhost_task_wake(nx_thread); +} + static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) @@ -7180,7 +7366,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -7315,7 +7501,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -7447,18 +7633,25 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; kvm->arch.nx_huge_page_last = get_jiffies_64(); - kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( - kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, - kvm, "kvm-nx-lpage-recovery"); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - if (kvm->arch.nx_huge_page_recovery_thread) - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); + + vhost_task_start(nx_thread); + + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7466,10 +7659,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f4711674c47b..68e323568e95 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -510,8 +510,7 @@ error: * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..0f9f47b4ab0e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { + /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 59746854c0af..79cdceba9857 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 047b78333653..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); @@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) } /* - * SPTEs must be modified atomically if they are shadow-present, leaf - * SPTEs, and have volatile bits, i.e. has bits that can be set outside - * of mmu_lock. The Writable bit can be set by KVM's fast page fault - * handler, and Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. */ -static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) { return is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte); + spte_needs_atomic_update(old_spte); } static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); @@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..21a3b8166242 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -193,6 +195,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, !tdp_mmu_root_match((_root), (_types)))) { \ } else +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else + #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) @@ -312,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** @@ -774,9 +793,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, continue; \ else -#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \ - for_each_tdp_pte(_iter, _kvm, _root, _start, _end) - static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, struct tdp_iter *iter) { @@ -1235,7 +1251,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) @@ -1332,21 +1348,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + /* + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. + */ + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, @@ -1371,9 +1388,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, * valid roots! */ WARN_ON(types & ~KVM_VALID_ROOTS); - __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) { - guard(rcu)(); + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { if (!is_accessed_spte(iter.old_spte)) continue; @@ -1382,7 +1399,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, return true; ret = true; - kvm_tdp_mmu_age_spte(&iter); + kvm_tdp_mmu_age_spte(kvm, &iter); } } @@ -1904,7 +1921,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->root_role.level; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1931,7 +1948,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, struct tdp_iter iter; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index e0ab7df27b66..699e551ec93b 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -358,7 +358,7 @@ void enter_smm(struct kvm_vcpu *vcpu) goto error; #endif - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; kvm_mmu_reset_context(vcpu); return; error: diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d..834b67672d50 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. @@ -994,7 +994,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c32050..0bc708ee2788 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -140,7 +140,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } @@ -226,9 +226,7 @@ e_uncharge: static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -403,7 +401,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_init *data, unsigned long vm_type) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; @@ -500,10 +498,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_init data; - if (!sev->need_init) + if (!to_kvm_sev_info(kvm)->need_init) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && @@ -543,14 +540,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error) static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -622,9 +619,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -663,7 +660,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -686,11 +683,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -734,7 +729,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -751,7 +745,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. */ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -762,7 +756,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -802,7 +796,7 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; struct xregs_state *xsave; const u8 *s; @@ -972,7 +966,6 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -1005,7 +998,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -1033,19 +1026,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -1055,7 +1046,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -1074,11 +1065,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -1250,7 +1240,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -1302,7 +1292,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1316,7 +1305,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1358,7 +1347,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1378,7 +1367,6 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *report = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1411,7 +1399,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1441,12 +1429,11 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; @@ -1459,7 +1446,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1520,7 +1506,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); @@ -1552,12 +1538,11 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; @@ -1572,7 +1557,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1626,7 +1610,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1657,31 +1641,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1755,7 +1737,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1798,7 +1779,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1815,7 +1796,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1832,13 +1813,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1858,8 +1838,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1893,8 +1873,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1968,8 +1948,8 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -2009,8 +1989,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. */ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -2069,7 +2048,7 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; CLASS(fd, f)(source_fd); struct kvm *source_kvm; @@ -2093,7 +2072,7 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -2181,7 +2160,7 @@ static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_bind_asid(struct kvm *kvm, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_activate data = {0}; data.gctx_paddr = __psp_pa(sev->snp_context); @@ -2191,7 +2170,7 @@ static int snp_bind_asid(struct kvm *kvm, int *error) static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_start start = {0}; struct kvm_sev_snp_launch_start params; int rc; @@ -2260,7 +2239,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf void __user *src, int order, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); int n_private = 0, ret, i; int npages = (1 << order); gfn_t gfn; @@ -2350,7 +2329,7 @@ err: static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_gmem_populate_args sev_populate_args = {0}; struct kvm_sev_snp_launch_update params; struct kvm_memory_slot *memslot; @@ -2434,7 +2413,7 @@ out: static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_update data = {}; struct kvm_vcpu *vcpu; unsigned long i; @@ -2482,7 +2461,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct kvm_sev_snp_launch_finish params; struct sev_data_snp_launch_finish *data; void *id_block = NULL, *id_auth = NULL; @@ -2677,7 +2656,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -2696,7 +2675,8 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); @@ -2729,7 +2709,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2824,9 +2804,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2854,7 +2834,7 @@ e_unlock: static int snp_decommission_context(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_addr data = {}; int ret; @@ -2879,7 +2859,7 @@ static int snp_decommission_context(struct kvm *kvm) void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2972,6 +2952,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -3261,7 +3251,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -3420,8 +3410,7 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -3462,10 +3451,19 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - unsigned int asid = sev_get_asid(svm->vcpu.kvm); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. + */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3478,11 +3476,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) @@ -3564,8 +3563,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -3665,7 +3663,14 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) svm->sev_es.psc_inflight = 0; svm->sev_es.psc_idx = 0; svm->sev_es.psc_2m = false; - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); } static void __snp_complete_one_psc(struct vcpu_svm *svm) @@ -3837,110 +3842,90 @@ next_range: BUG(); } -static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; - WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; /* Mark the vCPU as offline and not runnable */ vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); /* Clear use of the VMSA */ svm->vmcb->control.vmsa_pa = INVALID_PAGE; - if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { - gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); - struct kvm_memory_slot *slot; - struct page *page; - kvm_pfn_t pfn; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot) - return -EINVAL; - - /* - * The new VMSA will be private memory guest memory, so - * retrieve the PFN from the gmem backend. - */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) - return -EINVAL; - - /* - * From this point forward, the VMSA will always be a - * guest-mapped page rather than the initial one allocated - * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa - * could be free'd and cleaned up here, but that involves - * cleanups like wbinvd_on_all_cpus() which would ideally - * be handled during teardown rather than guest boot. - * Deferring that also allows the existing logic for SEV-ES - * VMSAs to be re-used with minimal SNP-specific changes. - */ - svm->sev_es.snp_has_guest_vmsa = true; - - /* Use the new VMSA */ - svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - - /* Mark the vCPU as runnable */ - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - - /* - * gmem pages aren't currently migratable, but if this ever - * changes then care should be taken to ensure - * svm->sev_es.vmsa is pinned through some other means. - */ - kvm_release_page_clean(page); - } - /* * When replacing the VMSA during SEV-SNP AP creation, * mark the VMCB dirty so that full state is always reloaded. */ vmcb_mark_all_dirty(svm->vmcb); - return 0; -} + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; -/* - * Invoked as part of svm_vcpu_reset() processing of an init event. - */ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int ret; + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - if (!sev_snp_guest(vcpu->kvm)) + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) return; - mutex_lock(&svm->sev_es.snp_vmsa_mutex); + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; - if (!svm->sev_es.snp_ap_waiting_for_reset) - goto unlock; + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; - svm->sev_es.snp_ap_waiting_for_reset = false; + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - ret = __sev_snp_update_protected_guest_state(vcpu); - if (ret) - vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); -unlock: - mutex_unlock(&svm->sev_es.snp_vmsa_mutex); + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); } static int sev_snp_ap_creation(struct vcpu_svm *svm) { - struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *target_vcpu; struct vcpu_svm *target_svm; unsigned int request; unsigned int apic_id; - bool kick; - int ret; request = lower_32_bits(svm->vmcb->control.exit_info_1); apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); @@ -3953,47 +3938,23 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) return -EINVAL; } - ret = 0; - target_svm = to_svm(target_vcpu); - /* - * The target vCPU is valid, so the vCPU will be kicked unless the - * request is for CREATE_ON_INIT. For any errors at this stage, the - * kick will place the vCPU in an non-runnable state. - */ - kick = true; - - mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); - - target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - target_svm->sev_es.snp_ap_waiting_for_reset = true; - - /* Interrupt injection mode shouldn't change for AP creation */ - if (request < SVM_VMGEXIT_AP_DESTROY) { - u64 sev_features; - - sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; - sev_features ^= sev->vmsa_features; - - if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { - vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", - vcpu->arch.regs[VCPU_REGS_RAX]); - ret = -EINVAL; - goto out; - } - } + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); switch (request) { case SVM_VMGEXIT_AP_CREATE_ON_INIT: - kick = false; - fallthrough; case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -4007,30 +3968,32 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) vcpu_unimpl(vcpu, "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; break; case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; break; default: vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", request); - ret = -EINVAL; - break; + return -EINVAL; } -out: - if (kick) { + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); kvm_vcpu_kick(target_vcpu); } - mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); - - return ret; + return 0; } static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) @@ -4069,7 +4032,8 @@ static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_ goto out_unlock; } - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err)); + /* No action is requested *from KVM* if there was a firmware error. */ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); ret = 1; /* resume guest */ @@ -4125,8 +4089,7 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r return snp_handle_guest_req(svm, req_gpa, resp_gpa); request_invalid: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); return 1; /* resume guest */ } @@ -4134,7 +4097,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -4318,8 +4281,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (ret) return ret; - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + svm_vmgexit_success(svm, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -4354,7 +4316,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -4363,21 +4325,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } case SVM_VMGEXIT_HV_FEATURES: - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); - + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); ret = 1; break; case SVM_VMGEXIT_TERM_REQUEST: @@ -4398,8 +4358,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_AP_CREATION: ret = sev_snp_ap_creation(svm); if (ret) { - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; @@ -4565,7 +4524,7 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); /* * Set the GHCB MSR value as per the GHCB specification when emulating @@ -4580,6 +4539,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4603,14 +4564,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. */ - if (sev_vcpu_has_debug_swap(svm)) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); @@ -4635,7 +4604,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * Return from an AP Reset Hold VMGEXIT, where the guest will * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + svm_vmgexit_success(svm, 1); break; case AP_RESET_HOLD_MSR_PROTO: /* @@ -4833,7 +4802,7 @@ static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); kvm_pfn_t pfn_aligned; gfn_t gfn_aligned; int level, rc; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a..d5d0c5c3300b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,6 +607,9 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); + + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -684,6 +687,9 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + return 0; } @@ -1297,8 +1303,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1559,7 +1569,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) + if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && + static_branch_likely(&switch_vcpu_ibpb)) indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) @@ -1932,7 +1943,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1991,11 +2002,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2973,11 +2984,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -3165,6 +3172,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * AMD changed the architectural behavior of bits 5:2. On CPUs + * without BusLockTrap, bits 5:2 control "external pins", but + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed + * the guest to set bits 5:2 despite not actually virtualizing + * Performance-Monitoring/Breakpoint external pins. Drop bits + * 5:2 for backwards compatibility. + */ + data &= ~GENMASK(5, 2); + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3272,6 +3300,17 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } @@ -3342,6 +3381,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -3504,7 +3544,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); @@ -3587,7 +3627,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3609,6 +3649,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -4116,20 +4158,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4189,6 +4234,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4197,6 +4254,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4231,7 +4290,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4247,14 +4311,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4282,6 +4354,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -5043,6 +5119,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9d7cdb8fbf87..d4490eaed55d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -361,20 +361,18 @@ static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) #ifdef CONFIG_KVM_AMD_SEV static __always_inline bool sev_guest(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; + return to_kvm_sev_info(kvm)->active; } static __always_inline bool sev_es_guest(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return sev->es_active && !WARN_ON_ONCE(!sev->active); } static __always_inline bool sev_snp_guest(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && !WARN_ON_ONCE(!sev_es_guest(kvm)); @@ -581,10 +579,39 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) return false; } +static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm, + u64 response, u64 data) +{ + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data); +} + +static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector) +{ + u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector; + + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data); +} + +static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror); +} + +static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + +static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; @@ -715,7 +742,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -void pre_sev_run(struct vcpu_svm *svm, int cpu); +int pre_sev_run(struct vcpu_svm *svm, int cpu); void sev_init_vmcb(struct vcpu_svm *svm); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb1..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0b844cb97978..ccda95e53f62 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -830,12 +830,12 @@ TRACE_EVENT(kvm_emulate_insn, TP_ARGS(vcpu, failed), TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH ) + __field( __u8, flags ) + __field( __u8, failed ) ), TP_fast_assign( @@ -846,7 +846,7 @@ TRACE_EVENT(kvm_emulate_insn, __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt->fetch.data, - 15); + X86_MAX_INSTRUCTION_LENGTH); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e763..43ee9ed11291 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8a7af02d466e..5504d9e9fd32 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2970,7 +2970,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; @@ -3771,7 +3771,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -4618,7 +4618,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; @@ -4646,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4930,8 +4930,9 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4981,7 +4982,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -5071,7 +5073,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { if (vm_exit_reason != -1) @@ -5084,6 +5086,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -5316,9 +5329,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 2c296b6abb8c..6eedcfc91070 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification); +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len); + +static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + u32 exit_insn_len; + + if (to_vmx(vcpu)->fail || vm_exit_reason == -1 || + (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_insn_len = 0; + else + exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info, + exit_qualification, exit_insn_len); +} + void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..51116fe69a50 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); @@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); list_add_tail(&vmx->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..5c5766467a61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1477,7 +1477,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * performs IBPB on nested VM-Exit (a single nested transition * may switch the active VMCS multiple times). */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + if (static_branch_likely(&switch_vcpu_ibpb) && + (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) indirect_branch_prediction_barrier(); } @@ -1514,16 +1515,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -2582,6 +2579,34 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } +#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ +({ \ + int i, r = 0; \ + \ + BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ + BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ + \ + for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ + typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ + typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ + \ + if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ + continue; \ + \ + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ + "entry = %llx (%llx), exit = %llx (%llx)\n", \ + (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ + (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ + \ + if (error_on_inconsistent_vmcs_config) \ + r = -EIO; \ + \ + entry_controls &= ~n_ctrl; \ + exit_controls &= ~x_ctrl; \ + } \ + r; \ +}) + static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2593,7 +2618,6 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _vmentry_control = 0; u64 basic_msr; u64 misc_msr; - int i; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. @@ -2697,22 +2721,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control)) return -EIO; - for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { - u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; - u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; - - if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) - continue; - - pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", - _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); - - if (error_on_inconsistent_vmcs_config) - return -EIO; - - _vmentry_control &= ~n_ctrl; - _vmexit_control &= ~x_ctrl; - } + if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, + _vmentry_control, _vmexit_control)) + return -EIO; /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they @@ -3523,7 +3534,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -5215,6 +5226,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } +static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5241,7 +5258,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * point. */ if (is_nm_fault(intr_info)) { - kvm_queue_exception(vcpu, NM_VECTOR); + kvm_queue_exception_p(vcpu, NM_VECTOR, + is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1; } @@ -5648,6 +5666,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5815,7 +5839,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0; if (error_code & EPT_VIOLATION_GVA_IS_VALID) @@ -5869,11 +5893,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +/* + * Returns true if emulation is required (due to the vCPU having invalid state + * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the + * current vCPU state. + */ +static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->emulation_required && !vmx->rmode.vm86_active && + if (!vmx->emulation_required) + return false; + + /* + * It is architecturally impossible for emulation to be required when a + * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if + * guest state is invalid and unrestricted guest is disabled, i.e. KVM + * should synthesize VM-Fail instead emulation L2 code. This path is + * only reachable if userspace modifies L2 guest state after KVM has + * performed the nested VM-Enter consistency checks. + */ + if (vmx->nested.nested_run_pending) + return true; + + /* + * KVM only supports emulating exceptions if the vCPU is in Real Mode. + * If emulation is required, KVM can't perform a successful VM-Enter to + * inject the exception. + */ + return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } @@ -5896,7 +5944,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5920,7 +5968,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -6995,16 +7043,15 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * - * Do not blindly read xfd_err here, since this exception might - * be caused by L1 interception on a platform which doesn't - * support xfd at all. - * - * Do it conditionally upon guest_fpu::xfd. xfd_err matters - * only when xfd contains a non-zero value. + * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM + * interception may have been caused by L1 interception. Per the SDM, + * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * - * Queuing exception is done in vmx_handle_exit. See comment there. + * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. + * unlike CR2 and DR6, the value is not a payload that is attached to + * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) + if (is_xfd_nm_fault(vcpu)) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } @@ -7155,13 +7202,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); + case INTR_TYPE_HARD_EXCEPTION: { + u32 error_code = 0; + + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(error_code_field); + + kvm_requeue_exception(vcpu, vector, + idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, + error_code); break; + } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; @@ -7417,10 +7468,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7456,8 +7503,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* @@ -8007,38 +8054,50 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info) +static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; - bool intercept; int size; + bool imm; + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; + imm = info->src_type == OP_IMM; } else { port = info->dst_val; size = info->src_bytes; + imm = info->dst_type == OP_IMM; } - /* - * If the 'use IO bitmaps' VM-execution control is 0, IO instruction - * VM-exits depend on the 'unconditional IO exiting' VM-execution - * control. - * - * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. - */ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - intercept = nested_cpu_has(vmcs12, - CPU_BASED_UNCOND_IO_EXITING); - else - intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; + *exit_qualification = ((unsigned long)port << 16) | (size - 1); + + if (info->intercept == x86_intercept_ins || + info->intercept == x86_intercept_outs) + *exit_qualification |= BIT(4); + + if (info->rep_prefix) + *exit_qualification |= BIT(5); + + if (imm) + *exit_qualification |= BIT(6); + + return nested_vmx_check_io_bitmaps(vcpu, port, size); } int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -8047,26 +8106,34 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qualification = 0; + u32 vm_exit_reason; + u64 exit_insn_len; switch (info->intercept) { - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - * Note, RDPID is hidden behind ENABLE_RDTSCP. - */ case x86_intercept_rdpid: + /* + * RDPID causes #UD if not enabled through secondary execution + * controls (ENABLE_RDTSCP). Note, the implicit MSR access to + * TSC_AUX is NOT subject to interception, i.e. checking only + * the dedicated execution control is architecturally correct. + */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } - break; + return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - return vmx_check_intercept_io(vcpu, info); + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; + break; case x86_intercept_lgdt: case x86_intercept_lidt: @@ -8079,7 +8146,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + if (info->intercept == x86_intercept_lldt || + info->intercept == x86_intercept_ltr || + info->intercept == x86_intercept_sldt || + info->intercept == x86_intercept_str) + vm_exit_reason = EXIT_REASON_LDTR_TR; + else + vm_exit_reason = EXIT_REASON_GDTR_IDTR; + /* + * FIXME: Decode the ModR/M to generate the correct exit + * qualification for memory operands. + */ + break; + + case x86_intercept_hlt: + if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: @@ -8092,17 +8176,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || - !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... */ default: - break; + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_UNHANDLEABLE; + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) + return X86EMUL_UNHANDLEABLE; + + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, + exit_insn_len); + return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8b111ce1087c..951e44dc9d0e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -340,8 +340,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 633c87e2fd92..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c04..430773a5ef8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d4a6734b2d6..3712dde0bf9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -800,9 +800,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; @@ -810,13 +810,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); /* - * If the exception is destined for L2 and isn't being reinjected, - * morph it to a VM-Exit if L1 wants to intercept the exception. A - * previously injected exception is not checked because it was checked - * when it was original queued, and re-checking is incorrect if _L1_ - * injected the exception, in which case it's exempt from interception. + * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. */ - if (!reinject && is_guest_mode(vcpu) && + if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, has_payload, payload); @@ -825,28 +822,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On VM-Entry, an exception can be pending if and only - * if event injection was blocked by nested_run_pending. - * In that case, however, vcpu_enter_guest() requests an - * immediate exit, and the guest shouldn't proceed far - * enough to need reinjection. - */ - WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; @@ -887,30 +865,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. + */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -980,16 +981,10 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. @@ -1264,7 +1259,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } @@ -2080,10 +2075,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT)) + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) return kvm_handle_invalid_op(vcpu); +emulate_as_nop: pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } @@ -2569,6 +2574,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2626,12 +2634,18 @@ static inline bool kvm_check_tsc_unstable(void) * participates in. */ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, - u64 ns, bool matched) + u64 ns, bool matched, bool user_set_tsc) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (vcpu->arch.guest_tsc_protected) + return; + + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. @@ -2717,8 +2731,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) } } - if (user_value) - kvm->arch.user_set_tsc = true; /* * For a reliable TSC, we can match TSC offsets, and for an unstable @@ -2738,7 +2750,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) matched = true; } - __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } @@ -3116,15 +3128,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, - unsigned int offset, - bool force_tsc_unstable) + unsigned int offset) { - struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); + read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -3144,52 +3158,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, * it is consistent. */ - guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); - - if (force_tsc_unstable) - guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); smp_wmb(); - guest_hv_clock->version = ++vcpu->hv_clock.version; + guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -static int kvm_guest_time_update(struct kvm_vcpu *v) +int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; -#ifdef CONFIG_KVM_XEN - /* - * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless - * explicitly told to use TSC as its clocksource Xen will not set this bit. - * This default behaviour led to bugs in some guest kernels which cause - * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. - */ - bool xen_pvclock_tsc_unstable = - ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; -#endif kernel_ns = 0; host_tsc = 0; @@ -3250,35 +3246,57 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; - kvm_xen_update_tsc_info(v); } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. + */ + if (vcpu->pvclock_set_guest_stopped_request) { + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); + + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + } - vcpu->hv_clock.flags = pvclock_flags; + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); - if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time), - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); #endif - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3544,7 +3562,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -3733,7 +3751,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; - if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. + */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { @@ -3889,7 +3913,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3906,7 +3930,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); - } else { + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; @@ -3924,7 +3948,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -4573,6 +4597,11 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } +static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4681,7 +4710,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_VALID_FLAGS; @@ -4986,7 +5015,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) @@ -5725,8 +5755,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); - kvm->arch.user_set_tsc = true; - __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); r = 0; @@ -6905,23 +6934,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; - int ret = 0; - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time.active) - continue; - - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } - mutex_unlock(&kvm->lock); + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) @@ -10961,10 +10982,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { @@ -11215,9 +11241,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -11294,9 +11318,8 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { if (kvm_vcpu_has_events(vcpu)) - vcpu->arch.pv.pv_unhalted = false; - else - vcpu->arch.mp_state = state; + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); return 1; } else { vcpu->run->exit_reason = reason; @@ -11469,6 +11492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u32 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); @@ -11514,8 +11538,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || - (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -11573,7 +11598,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); @@ -11761,6 +11786,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11774,6 +11801,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -11816,10 +11845,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else - vcpu->arch.mp_state = mp_state->mp_state; + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -11946,7 +11975,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -12249,9 +12278,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) @@ -12358,6 +12387,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { int idx; + kvm_clear_async_pf_completion_queue(vcpu); + kvm_mmu_unload(vcpu); + kvmclock_reset(vcpu); kvm_x86_call(vcpu_free)(vcpu); @@ -12741,6 +12773,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); } + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: @@ -12750,37 +12783,6 @@ out: return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - once_init(&kvm->arch.nx_once); - return 0; -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_unload_vcpu_mmus(struct kvm *kvm) -{ - unsigned long i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); - } -} - -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_pit(kvm); -} - /** * __x86_set_memory_region: Setup KVM internal memory slot * @@ -12859,6 +12861,17 @@ EXPORT_SYMBOL_GPL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { + /* + * Stop all background workers and kthreads before destroying vCPUs, as + * iterating over vCPUs in a different task while vCPUs are being freed + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to + * be stopped before IRQ routing is freed. + */ + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); + + kvm_free_pit(kvm); + kvm_mmu_pre_destroy_vm(kvm); } @@ -12878,18 +12891,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - kvm_unload_vcpu_mmus(kvm); - kvm_x86_call(vm_destroy)(kvm); + kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_x86_call(vm_destroy)(kvm); } static void memslot_rmap_free(struct kvm_memory_slot *slot) @@ -13383,8 +13395,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (vcpu->arch.apf.send_user_only && - kvm_x86_call(get_cpl)(vcpu) == 0) + if (!vcpu->arch.apf.send_always && + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; if (is_guest_mode(vcpu)) { @@ -13474,7 +13486,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 91e50a513100..9dc32a409076 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -121,6 +121,13 @@ static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) return vcpu->arch.last_vmentry_cpu != -1; } +static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) +{ + vcpu->arch.mp_state = mp_state; + if (mp_state == KVM_MP_STATE_RUNNABLE) + vcpu->arch.pv.pv_unhalted = false; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || @@ -362,6 +369,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); +int kvm_guest_time_update(struct kvm_vcpu *v); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a909b817b9c0..38b33cdd4232 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, + struct pvclock_vcpu_time_info *hv_clock, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) +{ + unsigned long flags; + int r; + + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); + if (r) + return r; + + read_lock_irqsave(&gpc->lock, flags); + } + + memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); + read_unlock_irqrestore(&gpc->lock, flags); + + /* + * Sanity check TSC shift+multiplier to verify the guest's view of time + * is more or less consistent. + */ + if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul) + return -EINVAL; + + return 0; +} + static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, bool linux_wa) { + struct kvm_vcpu_xen *xen = &vcpu->arch.xen; int64_t kernel_now, delta; uint64_t guest_now; + int r = -EOPNOTSUPP; /* * The guest provides the requested timeout in absolute nanoseconds @@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, * the absolute CLOCK_MONOTONIC time at which the timer should * fire. */ - if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && - static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + do { + struct pvclock_vcpu_time_info hv_clock; uint64_t host_tsc, guest_tsc; + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) || + !vcpu->kvm->arch.use_master_clock) + break; + + /* + * If both Xen PV clocks are active, arbitrarily try to use the + * compat clock first, but also try to use the non-compat clock + * if the compat clock is unusable. The two PV clocks hold the + * same information, but it's possible one (or both) is stale + * and/or currently unreachable. + */ + if (xen->vcpu_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (r && xen->vcpu_time_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0); + if (r) + break; + if (!IS_ENABLED(CONFIG_64BIT) || !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { /* @@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, /* Calculate the guest kvmclock as the guest would do it. */ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); - guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, - guest_tsc); - } else { + guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc); + } while (0); + + if (r) { /* * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. * @@ -1280,10 +1335,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) * Note, truncation is a non-issue as 'lm' is guaranteed to be * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. */ - hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 - : kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; + hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64 + : kvm->arch.xen.hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64 + : kvm->arch.xen.hvm_config.blob_size_32; u8 *page; int ret; @@ -1324,15 +1379,24 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; + /* + * Restrict the MSR to the range that is unofficially reserved for + * synthetic, virtualization-defined MSRs, e.g. to prevent confusing + * KVM by colliding with a real MSR that requires special handling. + */ + if (xhc->msr && + (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX)) + return -EINVAL; + mutex_lock(&kvm->arch.xen.xen_lock); - if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + if (xhc->msr && !kvm->arch.xen.hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); - else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + else if (!xhc->msr && kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); - old_flags = kvm->arch.xen_hvm_config.flags; - memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + old_flags = kvm->arch.xen.hvm_config.flags; + memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc)); mutex_unlock(&kvm->arch.xen.xen_lock); @@ -1413,7 +1477,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, int i; if (!lapic_in_kernel(vcpu) || - !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; if (IS_ENABLED(CONFIG_64BIT) && !longmode) { @@ -1480,7 +1544,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, @@ -1489,9 +1553,9 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, kvm_vcpu_halt(vcpu); if (sched_poll.timeout) - del_timer(&vcpu->arch.xen.poll_timer); + timer_delete(&vcpu->arch.xen.poll_timer); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } vcpu->arch.xen.poll_evtchn = 0; @@ -2225,8 +2289,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); @@ -2244,30 +2308,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); - del_timer_sync(&vcpu->arch.xen.poll_timer); -} - -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *entry; - u32 function; - - if (!vcpu->arch.xen.cpuid.base) - return; - - function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); - if (function > vcpu->arch.xen.cpuid.limit) - return; - - entry = kvm_find_cpuid_entry_index(vcpu, function, 1); - if (entry) { - entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; - entry->edx = vcpu->arch.hv_clock.tsc_shift; - } - - entry = kvm_find_cpuid_entry_index(vcpu, function, 2); - if (entry) - entry->eax = vcpu->arch.hw_tsc_khz; + timer_delete_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) @@ -2291,6 +2332,6 @@ void kvm_xen_destroy_vm(struct kvm *kvm) } idr_destroy(&kvm->arch.xen.evtchn_ports); - if (kvm->arch.xen_hvm_config.msr) + if (kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index f5841d9000ae..59e6128a7bd3 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,7 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> #ifdef CONFIG_KVM_XEN @@ -35,7 +36,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) { @@ -50,16 +50,32 @@ static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) kvm_xen_inject_vcpu_vector(vcpu); } +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.cpuid.base && + function <= vcpu->arch.xen.cpuid.limit && + function == (vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3)); +} + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - kvm->arch.xen_hvm_config.msr; + kvm->arch.xen.hvm_config.msr; +} + +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + if (!static_branch_unlikely(&kvm_xen_enabled.key)) + return false; + + return msr && msr == kvm->arch.xen.hvm_config.msr; } static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - (kvm->arch.xen_hvm_config.flags & + (kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); } @@ -124,6 +140,11 @@ static inline bool kvm_xen_msr_enabled(struct kvm *kvm) return false; } +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + return false; +} + static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return false; @@ -157,8 +178,9 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) return false; } -static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) { + return false; } #endif |