diff options
-rw-r--r-- | arch/x86/include/asm/kvm-x86-ops.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/svm.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 45 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_emulate.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/smm.c | 15 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 25 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 85 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 110 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 1 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 21 |
15 files changed, 184 insertions, 177 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 378ed944b849..3942b74c1b75 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -103,7 +103,6 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d271ba20a0b2..6b87770ea34d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1665,7 +1665,8 @@ struct kvm_x86_ops { void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, + bool force_immediate_exit); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1733,8 +1734,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* @@ -2047,7 +2046,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); @@ -2240,7 +2239,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 87a7b917d30e..728c98175b9c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -358,10 +358,10 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u64 vmpl0_ssp; - u64 vmpl1_ssp; - u64 vmpl2_ssp; - u64 vmpl3_ssp; + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; + u64 pl3_ssp; u64 u_cet; u8 reserved_0xc8[2]; u8 vmpl; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e223043ef5b2..3328c7c7f2f9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1820,22 +1820,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) return X86EMUL_CONTINUE; } -static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) +static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len) { struct segmented_address addr; - rsp_increment(ctxt, -bytes); + rsp_increment(ctxt, -len); addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; - return segmented_write(ctxt, addr, data, bytes); + return segmented_write(ctxt, addr, data, len); } static int em_push(struct x86_emulate_ctxt *ctxt) { /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return push(ctxt, &ctxt->src.val, ctxt->op_bytes); + return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1863,7 +1863,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; - unsigned long val, change_mask; + unsigned long val = 0; + unsigned long change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); @@ -1920,7 +1921,7 @@ static int em_enter(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; rbp = reg_read(ctxt, VCPU_REGS_RBP); - rc = push(ctxt, &rbp, stack_size(ctxt)); + rc = emulate_push(ctxt, &rbp, stack_size(ctxt)); if (rc != X86EMUL_CONTINUE) return rc; assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), @@ -1954,7 +1955,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; - unsigned long selector; + unsigned long selector = 0; int rc; rc = emulate_pop(ctxt, &selector, 2); @@ -2000,7 +2001,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; - u32 val; + u32 val = 0; while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { @@ -2229,7 +2230,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2241,7 +2242,8 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip, cs; + unsigned long eip = 0; + unsigned long cs = 0; int cpl = ctxt->ops->cpl(ctxt); struct desc_struct new_desc; @@ -3011,7 +3013,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } - ops->get_dr(ctxt, 7, &dr7); + dr7 = ops->get_dr(ctxt, 7); ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); return ret; @@ -3184,7 +3186,7 @@ fail: static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3866,15 +3868,6 @@ static int check_cr_access(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) -{ - unsigned long dr7; - - ctxt->ops->get_dr(ctxt, 7, &dr7); - - return dr7 & DR7_GD; -} - static int check_dr_read(struct x86_emulate_ctxt *ctxt) { int dr = ctxt->modrm_reg; @@ -3887,10 +3880,10 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); - if (check_dr7_gd(ctxt)) { + if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) { ulong dr6; - ctxt->ops->get_dr(ctxt, 6, &dr6); + dr6 = ctxt->ops->get_dr(ctxt, 6); dr6 &= ~DR_TRAP_BITS; dr6 |= DR6_BD | DR6_ACTIVE_LOW; ctxt->ops->set_dr(ctxt, 6, dr6); @@ -4505,11 +4498,11 @@ static const struct instr_dual instr_dual_0f_38_f1 = { }; static const struct gprefix three_byte_0f_38_f0 = { - ID(0, &instr_dual_0f_38_f0), N, N, N + ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N }; static const struct gprefix three_byte_0f_38_f1 = { - ID(0, &instr_dual_0f_38_f1), N, N, N + ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N }; /* @@ -5449,7 +5442,7 @@ twobyte_insn: ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); break; case 0x21: /* mov from dr to reg */ - ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); + ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg); break; case 0x40 ... 0x4f: /* cmov */ if (test_cc(ctxt->b, ctxt->eflags)) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index e6d149825169..153977c556f4 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -203,7 +203,7 @@ struct x86_emulate_ops { ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); - void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3242f3da2457..681f6d82d015 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -124,6 +124,9 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); + __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); @@ -2466,8 +2469,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu->arch.apic) + if (!vcpu->arch.apic) { + static_branch_dec(&kvm_has_noapic_vcpu); return; + } hrtimer_cancel(&apic->lapic_timer.timer); @@ -2809,6 +2814,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) ASSERT(vcpu != NULL); + if (!irqchip_in_kernel(vcpu->kvm)) { + static_branch_inc(&kvm_has_noapic_vcpu); + return 0; + } + apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; @@ -2844,6 +2854,21 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); + /* + * Defer evaluating inhibits until the vCPU is first run, as this vCPU + * will not get notified of any changes until this vCPU is visible to + * other vCPUs (marked online and added to the set of vCPUs). + * + * Opportunistically mark APICv active as VMX in particularly is highly + * unlikely to have inhibits. Ignore the current per-VM APICv state so + * that vCPU creation is guaranteed to run with a deterministic value, + * the request will ensure the vCPU gets the correct state before VM-Entry. + */ + if (enable_apicv) { + apic->apicv_active = true; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + return 0; nomem_free_apic: kfree(apic); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index dc3d95fdca7d..d06d43d8d2aa 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -184,7 +184,6 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, struct kvm_smram_state_32 *smram) { struct desc_ptr dt; - unsigned long val; int i; smram->cr0 = kvm_read_cr0(vcpu); @@ -195,10 +194,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, for (i = 0; i < 8; i++) smram->gprs[i] = kvm_register_read_raw(vcpu, i); - kvm_get_dr(vcpu, 6, &val); - smram->dr6 = (u32)val; - kvm_get_dr(vcpu, 7, &val); - smram->dr7 = (u32)val; + smram->dr6 = (u32)vcpu->arch.dr6; + smram->dr7 = (u32)vcpu->arch.dr7; enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); @@ -231,7 +228,6 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, struct kvm_smram_state_64 *smram) { struct desc_ptr dt; - unsigned long val; int i; for (i = 0; i < 16; i++) @@ -240,11 +236,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, smram->rip = kvm_rip_read(vcpu); smram->rflags = kvm_get_rflags(vcpu); - - kvm_get_dr(vcpu, 6, &val); - smram->dr6 = val; - kvm_get_dr(vcpu, 7, &val); - smram->dr7 = val; + smram->dr6 = vcpu->arch.dr6; + smram->dr7 = vcpu->arch.dr7; smram->cr0 = kvm_read_cr0(vcpu); smram->cr3 = kvm_read_cr3(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e90b429c84f1..b9096bb79c00 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2735,7 +2735,6 @@ static int dr_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int reg, dr; - unsigned long val; int err = 0; /* @@ -2763,11 +2762,9 @@ static int dr_interception(struct kvm_vcpu *vcpu) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - val = kvm_register_read(vcpu, reg); - err = kvm_set_dr(vcpu, dr, val); + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } else { - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); } return kvm_complete_insn_gp(vcpu, err); @@ -4092,6 +4089,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -4115,12 +4115,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4139,9 +4140,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is enough to force an immediate vmexit. */ disable_nmi_singlestep(svm); - smp_send_reschedule(vcpu->cpu); + force_immediate_exit = true; } + if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); + pre_svm_run(vcpu); sync_lapic_to_cr8(vcpu); @@ -4237,9 +4241,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - return svm_exit_handlers_fastpath(vcpu); } @@ -4997,8 +4998,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .request_immediate_exit = __kvm_request_immediate_exit, - .sched_in = svm_sched_in, .nested_ops = &svm_nested_ops, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813e..88659de4d2a7 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,20 +15,23 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), + TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), + TP_ARGS(vcpu, force_immediate_exit), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) + __field( bool, immediate_exit ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; __entry->rip = kvm_rip_read(vcpu); + __entry->immediate_exit = force_immediate_exit; ), - TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) + TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, + __entry->immediate_exit ? "[immediate exit]" : "") ); /* diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6329a306856b..0cd977f22c4b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 88a4ff200d04..fa3c3abb50e4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -49,6 +49,8 @@ #include <asm/spec-ctrl.h> #include <asm/vmx.h> +#include <trace/events/ipi.h> + #include "capabilities.h" #include "cpuid.h" #include "hyperv.h" @@ -1290,8 +1292,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - vmx->req_immediate_exit = false; - /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions @@ -5575,10 +5575,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); err = 0; } else { err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); @@ -6000,22 +5997,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { - kvm_lapic_expired_hv_timer(vcpu); + /* + * In the *extremely* unlikely scenario that this is a spurious VM-Exit + * due to the timer expiring while it was "soft" disabled, just eat the + * exit and re-enter the guest. + */ + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST; - } - return EXIT_FASTPATH_NONE; + /* + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ + if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* + * If L2 is active, go down the slow path as emulating the guest timer + * expiration likely requires synthesizing a nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - handle_fastpath_preemption_timer(vcpu); + /* + * This non-fastpath handler is reached if and only if the preemption + * timer was being used to emulate a guest timer while L2 is active. + * All other scenarios are supposed to be handled in the fastpath. + */ + WARN_ON_ONCE(!is_guest_mode(vcpu)); + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -7155,13 +7176,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->req_immediate_exit) { + if (force_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { @@ -7214,13 +7235,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, barrier_nospec(); } -static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { + /* + * If L2 is active, some VMX preemption timer exits can be handled in + * the fastpath even, all other exits must use the slow path. + */ + if (is_guest_mode(vcpu) && + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: - return handle_fastpath_preemption_timer(vcpu); + return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); default: return EXIT_FASTPATH_NONE; } @@ -7280,7 +7310,7 @@ out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7307,7 +7337,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -7366,7 +7396,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) - vmx_update_hv_timer(vcpu); + vmx_update_hv_timer(vcpu, force_immediate_exit); + else if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); kvm_wait_lapic_expire(vcpu); @@ -7430,10 +7462,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - - return vmx_exit_handlers_fastpath(vcpu); + return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } static void vmx_vcpu_free(struct kvm_vcpu *vcpu) @@ -7913,11 +7942,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} - static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { @@ -8370,8 +8394,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .request_immediate_exit = vmx_request_immediate_exit, - .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, @@ -8631,7 +8653,6 @@ static __init int hardware_setup(void) if (!enable_preemption_timer) { vmx_x86_ops.set_hv_timer = NULL; vmx_x86_ops.cancel_hv_timer = NULL; - vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e3b0985bb74a..65786dbe7d60 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -332,8 +332,6 @@ struct vcpu_vmx { unsigned int ple_window; bool ple_window_dirty; - bool req_immediate_exit; - /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3638a104bcf7..e6b1b85dca8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) } EXPORT_SYMBOL_GPL(kvm_set_dr); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) { size_t size = ARRAY_SIZE(vcpu->arch.db); switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[array_index_nospec(dr, size)]; - break; + return vcpu->arch.db[array_index_nospec(dr, size)]; case 4: case 6: - *val = vcpu->arch.dr6; - break; + return vcpu->arch.dr6; case 5: default: /* 7 */ - *val = vcpu->arch.dr7; - break; + return vcpu->arch.dr7; } } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -5061,8 +5058,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) { - if (!vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu); /* * Take the srcu lock as memslots will be accessed to check the gfn @@ -5509,18 +5505,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - unsigned long val; + unsigned int i; memset(dbgregs, 0, sizeof(*dbgregs)); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - kvm_get_dr(vcpu, 6, &val); - dbgregs->dr6 = val; + + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + dbgregs->db[i] = vcpu->arch.db[i]; + + dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned int i; + if (dbgregs->flags) return -EINVAL; @@ -5529,7 +5530,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + vcpu->arch.db[i] = dbgregs->db[i]; + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; @@ -8167,10 +8170,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr) { - kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return kvm_get_dr(emul_to_vcpu(ctxt), dr); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, @@ -9635,11 +9637,13 @@ static void kvm_x86_check_cpu_compat(void *ret) *(int *)ret = kvm_x86_check_processor_compatibility(); } -static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { u64 host_pat; int r, cpu; + guard(mutex)(&vendor_module_lock); + if (kvm_x86_ops.hardware_enable) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; @@ -9769,17 +9773,6 @@ out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } - -int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) -{ - int r; - - mutex_lock(&vendor_module_lock); - r = __kvm_x86_vendor_init(ops); - mutex_unlock(&vendor_module_lock); - - return r; -} EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) @@ -10676,12 +10669,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - smp_send_reschedule(vcpu->cpu); -} -EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); - /* * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without @@ -10931,10 +10918,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - if (req_immediate_exit) { + if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_request_immediate_exit)(vcpu); - } fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -10965,7 +10950,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -12063,27 +12048,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (r < 0) return r; - if (irqchip_in_kernel(vcpu->kvm)) { - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - - /* - * Defer evaluating inhibits until the vCPU is first run, as - * this vCPU will not get notified of any changes until this - * vCPU is visible to other vCPUs (marked online and added to - * the set of vCPUs). Opportunistically mark APICv active as - * VMX in particularly is highly unlikely to have inhibits. - * Ignore the current per-VM APICv state so that vCPU creation - * is guaranteed to run with a deterministic value, the request - * will ensure the vCPU gets the correct state before VM-Entry. - */ - if (enable_apicv) { - vcpu->arch.apic->apicv_active = true; - kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); - } - } else - static_branch_inc(&kvm_has_noapic_vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; r = -ENOMEM; @@ -12204,8 +12171,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); - if (!lapic_in_kernel(vcpu)) - static_branch_dec(&kvm_has_noapic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -12482,9 +12447,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); -EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -13087,11 +13049,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_apicv_active(vcpu) && - static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) - return true; + return kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu); +} - return false; +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; } bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) @@ -13114,9 +13078,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - if (vcpu != kvm_get_running_vcpu()) - return vcpu->arch.preempted_in_kernel; - return static_call(kvm_x86_get_cpl)(vcpu) == 0; } @@ -13911,9 +13872,6 @@ module_init(kvm_x86_init); static void __exit kvm_x86_exit(void) { - /* - * If module_init() is implemented, module_exit() must also be - * implemented to allow module unload. - */ + WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu)); } module_exit(kvm_x86_exit); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 77b9ec28c9ec..7d3385653377 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1504,6 +1504,7 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 741d86ca06a1..fb49c2a60200 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4046,6 +4046,18 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +/* + * By default, simply query the target vCPU's current mode when checking if a + * vCPU was preempted in kernel mode. All architectures except x86 (or more + * specifical, except VMX) allow querying whether or not a vCPU is in kernel + * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel() + * directly for cross-vCPU checks is functionally correct and accurate. + */ +bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_in_kernel(vcpu); +} + bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; @@ -4082,9 +4094,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) continue; + + /* + * Treat the target vCPU as being in-kernel if it has a + * pending interrupt, as the vCPU trying to yield may + * be spinning waiting on IPI delivery, i.e. the target + * vCPU is in-kernel for the purposes of directed yield. + */ if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_dy_has_pending_interrupt(vcpu) && - !kvm_arch_vcpu_in_kernel(vcpu)) + !kvm_arch_vcpu_preempted_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; |