summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-15 15:43:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-15 15:43:55 -0700
commit9db59599ae502b38b27cff6462273f84acd59927 (patch)
tree96d90a2f7bcddc837987579ad2d3e58b891db716
parentb38923a068c10fc36ca8f596d650d095ce390b85 (diff)
parent4f350c6dbcb9000e18907515ec8a7b205ac33c69 (diff)
downloadlwn-9db59599ae502b38b27cff6462273f84acd59927.tar.gz
lwn-9db59599ae502b38b27cff6462273f84acd59927.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini: - PPC bugfixes - RCU splat fix - swait races fix - pointless userspace-triggerable BUG() fix - misc fixes for KVM_RUN corner cases - nested virt correctness fixes + one host DoS - some cleanups - clang build fix - fix AMD AVIC with default QEMU command line options - x86 bugfixes * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits) kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry kvm,mips: Fix potential swait_active() races kvm,powerpc: Serialize wq active checks in ops->vcpu_kick kvm: Serialize wq active checks in kvm_vcpu_wake_up() kvm,x86: Fix apf_task_wake_one() wq serialization kvm,lapic: Justify use of swait_active() kvm,async_pf: Use swq_has_sleeper() sched/wait: Add swq_has_sleeper() KVM: VMX: Do not BUG() on out-of-bounds guest IRQ KVM: Don't accept obviously wrong gsi values via KVM_IRQFD kvm: nVMX: Don't allow L2 to access the hardware CR8 KVM: trace events: update list of exit reasons KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously KVM: X86: Don't block vCPU if there is pending exception KVM: SVM: Add irqchip_split() checks before enabling AVIC KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv() KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu() KVM: x86: fix clang build ...
-rw-r--r--arch/mips/kvm/mips.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xive.c1
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S17
-rw-r--r--arch/powerpc/kvm/book3s_xive.c1
-rw-r--r--arch/powerpc/kvm/book3s_xive_template.c7
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kvm/cpuid.h1
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/svm.c38
-rw-r--r--arch/x86/kvm/vmx.c162
-rw-r--r--arch/x86/kvm/x86.c51
-rw-r--r--include/linux/swait.h58
-rw-r--r--include/trace/events/kvm.h4
-rw-r--r--virt/kvm/async_pf.c6
-rw-r--r--virt/kvm/eventfd.c2
-rw-r--r--virt/kvm/kvm_main.c3
18 files changed, 257 insertions, 111 deletions
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bce2a6431430..d535edc01434 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
dvcpu->arch.wait = 0;
- if (swait_active(&dvcpu->wq))
+ if (swq_has_sleeper(&dvcpu->wq))
swake_up(&dvcpu->wq);
return 0;
@@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
kvm_mips_callbacks->queue_timer_int(vcpu);
vcpu->arch.wait = 0;
- if (swait_active(&vcpu->wq))
+ if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 18e974a34fce..73bf1ebfa78f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
- if (swait_active(wqp)) {
+ if (swq_has_sleeper(wqp)) {
swake_up(wqp);
++vcpu->stat.halt_wakeup;
}
@@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
+ mutex_lock(&kvm->lock);
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+ mutex_unlock(&kvm->lock);
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
index abf5f01b6eb1..5b81a807d742 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xive.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c
@@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void)
#define __x_tima get_tima_phys()
#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page))
#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page))
-#define __x_readb __raw_rm_readb
#define __x_writeb __raw_rm_writeb
#define __x_readw __raw_rm_readw
#define __x_readq __raw_rm_readq
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 663a4a861e7f..17936f82d3c7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
+ /*
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ */
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
@@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
+ /*
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ */
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
@@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
/*
* Are we running hash or radix ?
*/
- beq cr2,3f
+ ld r5, VCPU_KVM(r9)
+ lbz r0, KVM_RADIX(r5)
+ cmpwi cr2, r0, 0
+ beq cr2, 3f
/* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid)
@@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
+ /*
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ */
ld r9, HSTATE_KVM_VCPU(r13)
bl kvmppc_save_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -2578,6 +2590,9 @@ kvm_end_cede:
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
+ /*
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ */
bl kvmppc_restore_tm
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 08b200a0bbce..13304622ab1c 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -48,7 +48,6 @@
#define __x_tima xive_tima
#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio))
#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio))
-#define __x_readb __raw_readb
#define __x_writeb __raw_writeb
#define __x_readw __raw_readw
#define __x_readq __raw_readq
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index d1ed2c41b5d2..c7a5deadd1cc 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
* bit.
*/
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
- u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+ __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+ u8 pipr = be64_to_cpu(qw1) & 0xff;
if (pipr >= xc->hw_cppr)
return;
}
@@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
u8 pending = xc->pending;
u32 hirq;
- u8 pipr;
pr_devel("H_IPOLL(server=%ld)\n", server);
@@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long
pending = 0xff;
} else {
/* Grab pending interrupt if any */
- pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+ __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS);
+ u8 pipr = be64_to_cpu(qw1) & 0xff;
if (pipr < 8)
pending |= 1 << pipr;
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8844eee290b2..c73e493adf07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -951,7 +951,6 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- u32 (*get_pkru)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -973,7 +972,7 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*get_enable_apicv)(void);
+ bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 874827b0d7ca..aa60a08b65b1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
hlist_del_init(&n->link);
if (n->halted)
smp_send_reschedule(n->cpu);
- else if (swait_active(&n->wq))
+ else if (swq_has_sleeper(&n->wq))
swake_up(&n->wq);
}
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 1ea3c0e1e3a9..0bc5c1315708 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
{
unsigned x86_leaf = x86_feature / 32;
- BUILD_BUG_ON(!__builtin_constant_p(x86_leaf));
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index aaf10b6f5380..69c5612be786 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu);
+ /*
+ * For x86, the atomic_inc() is serialized, thus
+ * using swait_active() is safe.
+ */
if (swait_active(q))
swake_up(q);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2c1cfe68a9af..0e68f0b3cbf7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- svm->vcpu.arch.apicv_active = true;
}
static void init_vmcb(struct vcpu_svm *svm)
@@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_PAUSE);
}
- if (avic)
+ if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_init_vmcb(svm);
/*
@@ -1600,6 +1599,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
+static int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ return 0;
+
+ ret = avic_init_backing_page(&svm->vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
+ return ret;
+}
+
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
{
struct vcpu_svm *svm;
@@ -1636,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (!hsave_page)
goto free_page3;
- if (avic) {
- err = avic_init_backing_page(&svm->vcpu);
- if (err)
- goto free_page4;
-
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
- }
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto free_page4;
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
@@ -4395,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
-static bool svm_get_enable_apicv(void)
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
{
- return avic;
+ return avic && irqchip_split(vcpu->kvm);
}
static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
@@ -4414,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- if (!avic)
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
return;
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
@@ -5302,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
*/
if (info->rep_prefix != REPE_PREFIX)
goto out;
+ break;
case SVM_EXIT_IOIO: {
u64 exit_info;
u32 bytes;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 699704d4bc9e..06c0c6d0541e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ
}
}
-static bool vmx_get_enable_apicv(void)
+static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
return enable_apicv;
}
@@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
- vmcs_readl(EXIT_QUALIFICATION),
- vmx->idt_vectoring_info,
- intr_info,
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
- KVM_ISA_VMX);
+ if (vmx->nested.nested_run_pending)
+ return false;
+
+ if (unlikely(vmx->fail)) {
+ pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ return true;
+ }
/*
* The host physical addresses of some pages of guest memory
@@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
- if (vmx->nested.nested_run_pending)
- return false;
-
- if (unlikely(vmx->fail)) {
- pr_info_ratelimited("%s failed vm entry %x\n", __func__,
- vmcs_read32(VM_INSTRUCTION_ERROR));
- return true;
- }
+ trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+ vmcs_readl(EXIT_QUALIFICATION),
+ vmx->idt_vectoring_info,
+ intr_info,
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ KVM_ISA_VMX);
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
@@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
- vmx->loaded_vmcs->launched = 1;
-
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-
/*
* eager fpu is enabled if PKEY is supported and CR4 is switched
* back on host, so it is safe to read guest PKRU from current
@@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vmx->nested.nested_run_pending = 0;
+ vmx->idt_vectoring_info = 0;
+
+ vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+ if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ return;
+
+ vmx->loaded_vmcs->launched = 1;
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -10525,6 +10527,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (exec_control & CPU_BASED_TPR_SHADOW) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ } else {
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
}
/*
@@ -11388,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 vm_inst_error = 0;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0.
+ */
+ WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD));
+
leave_guest_mode(vcpu);
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
- if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
- vmcs12->vm_exit_msr_store_count))
- nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ if (likely(!vmx->fail)) {
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
- if (unlikely(vmx->fail))
- vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ }
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-
- /*
- * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
- * the VM-exit interrupt information (valid interrupt) is always set to
- * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
- * kvm_cpu_has_interrupt(). See the commit message for details.
- */
- if (nested_exit_intr_ack_set(vcpu) &&
- exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- kvm_cpu_has_interrupt(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
-
vm_entry_controls_reset_shadow(vmx);
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
@@ -11436,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (VMCS02_POOL_SIZE == 0)
nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
- load_vmcs12_host_state(vcpu, vmcs12);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
@@ -11486,21 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- /*
- * Exiting from L2 to L1, we're now back to L1 which thinks it just
- * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
- * success or failure flag accordingly.
- */
- if (unlikely(vmx->fail)) {
- vmx->fail = 0;
- nested_vmx_failValid(vcpu, vm_inst_error);
- } else
- nested_vmx_succeed(vcpu);
if (enable_shadow_vmcs)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (likely(!vmx->fail)) {
+ /*
+ * TODO: SDM says that with acknowledge interrupt on
+ * exit, bit 31 of the VM-exit interrupt information
+ * (valid interrupt) is always set to 1 on
+ * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
+ * need kvm_cpu_has_interrupt(). See the commit
+ * message for details.
+ */
+ if (nested_exit_intr_ack_set(vcpu) &&
+ exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ kvm_cpu_has_interrupt(vcpu)) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+ WARN_ON(irq < 0);
+ vmcs12->vm_exit_intr_info = irq |
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+ }
+
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ return;
+ }
+
+ /*
+ * After an early L2 VM-entry failure, we're now back
+ * in L1 which thinks it just finished a VMLAUNCH or
+ * VMRESUME instruction, so we need to set the failure
+ * flag and the VM-instruction error field of the VMCS
+ * accordingly.
+ */
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ /*
+ * The emulated instruction was already skipped in
+ * nested_vmx_run, but the updated RIP was never
+ * written back to the vmcs01.
+ */
+ skip_emulated_instruction(vcpu);
+ vmx->fail = 0;
}
/*
@@ -11829,7 +11854,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
- int idx, ret = -EINVAL;
+ int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11838,7 +11863,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
if (e->type != KVM_IRQ_ROUTING_MSI)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6069af86da3b..cd17b7d9a107 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7231,10 +7231,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ if (kvm_run->immediate_exit) {
+ r = -EINTR;
+ goto out;
+ }
kvm_vcpu_block(vcpu);
kvm_apic_accept_events(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
goto out;
}
@@ -7971,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -8452,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (vcpu->arch.pv.pv_unhalted)
return true;
+ if (vcpu->arch.exception.pending)
+ return true;
+
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
kvm_x86_ops->nmi_allowed(vcpu)))
@@ -8619,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
sizeof(val));
}
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+ sizeof(u32));
+}
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
@@ -8646,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
+ u32 val;
if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
@@ -8653,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
trace_kvm_async_pf_ready(work->arch.token, work->gva);
- if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- fault.async_page_fault = true;
- kvm_inject_page_fault(vcpu, &fault);
+ if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+ !apf_get_user(vcpu, &val)) {
+ if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+ vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == PF_VECTOR &&
+ !apf_put_user(vcpu, 0)) {
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.nr = 0;
+ vcpu->arch.exception.has_error_code = false;
+ vcpu->arch.exception.error_code = 0;
+ } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
}
vcpu->arch.apf.halted = false;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 4a4e180d0a35..73e97a08d3d0 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
DECLARE_SWAIT_QUEUE_HEAD(name)
#endif
-static inline int swait_active(struct swait_queue_head *q)
+/**
+ * swait_active -- locklessly test for waiters on the queue
+ * @wq: the waitqueue to test for waiters
+ *
+ * returns true if the wait list is not empty
+ *
+ * NOTE: this function is lockless and requires care, incorrect usage _will_
+ * lead to sporadic and non-obvious failure.
+ *
+ * NOTE2: this function has the same above implications as regular waitqueues.
+ *
+ * Use either while holding swait_queue_head::lock or when used for wakeups
+ * with an extra smp_mb() like:
+ *
+ * CPU0 - waker CPU1 - waiter
+ *
+ * for (;;) {
+ * @cond = true; prepare_to_swait(&wq_head, &wait, state);
+ * smp_mb(); // smp_mb() from set_current_state()
+ * if (swait_active(wq_head)) if (@cond)
+ * wake_up(wq_head); break;
+ * schedule();
+ * }
+ * finish_swait(&wq_head, &wait);
+ *
+ * Because without the explicit smp_mb() it's possible for the
+ * swait_active() load to get hoisted over the @cond store such that we'll
+ * observe an empty wait list while the waiter might not observe @cond.
+ * This, in turn, can trigger missing wakeups.
+ *
+ * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
+ * which (when the lock is uncontended) are of roughly equal cost.
+ */
+static inline int swait_active(struct swait_queue_head *wq)
+{
+ return !list_empty(&wq->task_list);
+}
+
+/**
+ * swq_has_sleeper - check if there are any waiting processes
+ * @wq: the waitqueue to test for waiters
+ *
+ * Returns true if @wq has waiting processes
+ *
+ * Please refer to the comment for swait_active.
+ */
+static inline bool swq_has_sleeper(struct swait_queue_head *wq)
{
- return !list_empty(&q->task_list);
+ /*
+ * We need to be sure we are in sync with the list_add()
+ * modifications to the wait queue (task_list).
+ *
+ * This memory barrier should be paired with one on the
+ * waiting side.
+ */
+ smp_mb();
+ return swait_active(wq);
}
extern void swake_up(struct swait_queue_head *q);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 8ade3eb6c640..dcffedfac431 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -14,7 +14,9 @@
ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \
- ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH)
+ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
+ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \
+ ERSN(HYPERV)
TRACE_EVENT(kvm_userspace_exit,
TP_PROTO(__u32 reason, int errno),
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index bb298a200cd3..57bcb27dcf30 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
- /*
- * This memory barrier pairs with prepare_to_wait's set_current_state()
- */
- smp_mb();
- if (swait_active(&vcpu->wq))
+ if (swq_has_sleeper(&vcpu->wq))
swake_up(&vcpu->wq);
mmput(mm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f2ac53ab8243..c608ab495282 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
+ if (args->gsi >= KVM_MAX_IRQ_ROUTES)
+ return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
return kvm_irqfd_deassign(kvm, args);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6ed1c2021198..9deb5a245b83 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -674,6 +674,7 @@ out_err_no_irq_srcu:
out_err_no_srcu:
hardware_disable_all();
out_err_no_disable:
+ refcount_set(&kvm->users_count, 0);
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
@@ -2186,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
struct swait_queue_head *wqp;
wqp = kvm_arch_vcpu_wq(vcpu);
- if (swait_active(wqp)) {
+ if (swq_has_sleeper(wqp)) {
swake_up(wqp);
++vcpu->stat.halt_wakeup;
return true;