summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-07-30 11:19:08 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-07-30 11:19:08 -0700
commit98a05fe8cd5e0afe2b4c52b5013b53c44d615148 (patch)
tree4dc81ee43b45b9c0463fe0b1f92c4fbf06accc8a
parentc959e90094d6db8ee1bfbe1a9c571fbd35d4daac (diff)
parent5a7591176c47cce363c1eed704241e5d1c42c5a6 (diff)
downloadlwn-98a05fe8cd5e0afe2b4c52b5013b53c44d615148.tar.gz
lwn-98a05fe8cd5e0afe2b4c52b5013b53c44d615148.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "x86: - Do not register IRQ bypass consumer if posted interrupts not supported - Fix missed device interrupt due to non-atomic update of IRR - Use GFP_KERNEL_ACCOUNT for pid_table in ipiv - Make VMREAD error path play nice with noinstr - x86: Acquire SRCU read lock when handling fastpath MSR writes - Support linking rseq tests statically against glibc 2.35+ - Fix reference count for stats file descriptors - Detect userspace setting invalid CR0 Non-KVM: - Remove coccinelle script that has caused multiple confusion ("debugfs, coccinelle: check for obsolete DEFINE_SIMPLE_ATTRIBUTE() usage", acked by Greg)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (21 commits) KVM: selftests: Expand x86's sregs test to cover illegal CR0 values KVM: VMX: Don't fudge CR0 and CR4 for restricted L2 guest KVM: x86: Disallow KVM_SET_SREGS{2} if incoming CR0 is invalid Revert "debugfs, coccinelle: check for obsolete DEFINE_SIMPLE_ATTRIBUTE() usage" KVM: selftests: Verify stats fd is usable after VM fd has been closed KVM: selftests: Verify stats fd can be dup()'d and read KVM: selftests: Verify userspace can create "redundant" binary stats files KVM: selftests: Explicitly free vcpus array in binary stats test KVM: selftests: Clean up stats fd in common stats_test() helper KVM: selftests: Use pread() to read binary stats header KVM: Grab a reference to KVM for VM and vCPU stats file descriptors selftests/rseq: Play nice with binaries statically linked against glibc 2.35+ Revert "KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid" KVM: x86: Acquire SRCU read lock when handling fastpath MSR writes KVM: VMX: Use vmread_error() to report VM-Fail in "goto" path KVM: VMX: Make VMREAD error path play nice with noinstr KVM: x86/irq: Conditionally register IRQ bypass consumer again KVM: X86: Use GFP_KERNEL_ACCOUNT for pid_table in ipiv KVM: x86: check the kvm_cpu_get_interrupt result before using it KVM: x86: VMX: set irr_pending in kvm_apic_update_irr ...
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/kvm/lapic.c25
-rw-r--r--arch/x86/kvm/svm/svm.c16
-rw-r--r--arch/x86/kvm/vmx/vmenter.S8
-rw-r--r--arch/x86/kvm/vmx/vmx.c62
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h12
-rw-r--r--arch/x86/kvm/x86.c50
-rw-r--r--scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci68
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h6
-rw-r--r--tools/testing/selftests/kvm/kvm_binary_stats_test.c68
-rw-r--r--tools/testing/selftests/kvm/x86_64/set_sregs_test.c70
-rw-r--r--tools/testing/selftests/rseq/rseq.c28
-rw-r--r--virt/kvm/kvm_main.c24
14 files changed, 255 insertions, 186 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 13bc212cd4bc..e3054e3e46d5 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -37,6 +37,7 @@ KVM_X86_OP(get_segment)
KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
KVM_X86_OP(get_cs_db_l_bits)
+KVM_X86_OP(is_valid_cr0)
KVM_X86_OP(set_cr0)
KVM_X86_OP_OPTIONAL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28bd38303d70..3bc146dfd38d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1566,9 +1566,10 @@ struct kvm_x86_ops {
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 113ca9661ab2..a983a16163b1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -637,16 +637,22 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
*max_irr = -1;
for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = *p_irr;
pir_val = READ_ONCE(pir[i]);
- irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+
if (pir_val) {
+ pir_val = xchg(&pir[i], 0);
+
prev_irr_val = irr_val;
- irr_val |= xchg(&pir[i], 0);
- *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
- if (prev_irr_val != irr_val) {
- max_updated_irr =
- __fls(irr_val ^ prev_irr_val) + vec;
- }
+ do {
+ irr_val = prev_irr_val | pir_val;
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
}
if (irr_val)
*max_irr = __fls(irr_val) + vec;
@@ -660,8 +666,11 @@ EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
- return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d381ad424554..956726d867aa 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1786,6 +1786,11 @@ static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
}
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3986,14 +3991,8 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
- /*
- * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
- * can't read guest memory (dereference memslots) to decode the WRMSR.
- */
- if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
- nrips && control->next_rip)
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
return EXIT_FASTPATH_NONE;
@@ -4815,6 +4814,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = svm_get_cs_db_l_bits,
+ .is_valid_cr0 = svm_is_valid_cr0,
.set_cr0 = svm_set_cr0,
.post_set_cr3 = sev_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 07e927d4d099..be275a0410a8 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -303,10 +303,8 @@ SYM_FUNC_START(vmx_do_nmi_irqoff)
VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
SYM_FUNC_END(vmx_do_nmi_irqoff)
-
-.section .text, "ax"
-
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
@@ -335,7 +333,7 @@ SYM_FUNC_START(vmread_error_trampoline)
mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2
mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1
- call vmread_error
+ call vmread_error_trampoline2
/* Zero out @fault, which will be popped into the result register. */
_ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
@@ -357,6 +355,8 @@ SYM_FUNC_START(vmread_error_trampoline)
SYM_FUNC_END(vmread_error_trampoline)
#endif
+.section .text, "ax"
+
SYM_FUNC_START(vmx_do_interrupt_irqoff)
VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0ecf4be2c6af..df461f387e20 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -441,13 +441,23 @@ do { \
pr_warn_ratelimited(fmt); \
} while (0)
-void vmread_error(unsigned long field, bool fault)
+noinline void vmread_error(unsigned long field)
{
- if (fault)
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
kvm_spurious_fault();
- else
- vmx_insn_failed("vmread failed: field=%lx\n", field);
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
}
+#endif
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
@@ -1503,6 +1513,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
@@ -3037,6 +3052,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
@@ -3226,6 +3250,17 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
+static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
+
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
+}
+
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3235,7 +3270,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -3263,7 +3298,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ if (enable_ept && !enable_unrestricted_guest) {
/*
* Ensure KVM has an up-to-date snapshot of the guest's CR3. If
* the below code _enables_ CR3 exiting, vmx_cache_reg() will
@@ -3394,7 +3429,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* this bit, even if host CR4.MCE == 0.
*/
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3414,7 +3449,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!is_unrestricted_guest(vcpu)) {
+ if (!enable_unrestricted_guest) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
@@ -4651,7 +4686,8 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
if (kvm_vmx->pid_table)
return 0;
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm));
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
if (!pages)
return -ENOMEM;
@@ -5364,18 +5400,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_guest_cr0_valid(vcpu, val))
- return 1;
-
if (kvm_set_cr0(vcpu, val))
return 1;
vmcs_writel(CR0_READ_SHADOW, orig_val);
return 0;
} else {
- if (to_vmx(vcpu)->nested.vmxon &&
- !nested_host_cr0_valid(vcpu, val))
- return 1;
-
return kvm_set_cr0(vcpu, val);
}
}
@@ -8203,6 +8232,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
.set_cr0 = vmx_set_cr0,
.is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ce47dc265f89..33af7b4c6eb4 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -10,7 +10,7 @@
#include "vmcs.h"
#include "../x86.h"
-void vmread_error(unsigned long field, bool fault);
+void vmread_error(unsigned long field);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
@@ -31,6 +31,13 @@ void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
* void vmread_error_trampoline(unsigned long field, bool fault);
*/
extern unsigned long vmread_error_trampoline;
+
+/*
+ * The second VMREAD error trampoline, called from the assembly trampoline,
+ * exists primarily to enable instrumentation for the VM-Fail path.
+ */
+void vmread_error_trampoline2(unsigned long field, bool fault);
+
#endif
static __always_inline void vmcs_check16(unsigned long field)
@@ -101,8 +108,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
do_fail:
instrumentation_begin();
- WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
- pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ vmread_error(field);
instrumentation_end();
return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6b9bea62fb8..278dbd37dab2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -906,6 +906,22 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
}
EXPORT_SYMBOL_GPL(load_pdptrs);
+static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return false;
+#endif
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return false;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return false;
+
+ return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+}
+
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
/*
@@ -952,20 +968,13 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- cr0 |= X86_CR0_ET;
-
-#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL)
+ if (!kvm_is_valid_cr0(vcpu, cr0))
return 1;
-#endif
- cr0 &= ~CR0_RESERVED_BITS;
-
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
- return 1;
+ cr0 |= X86_CR0_ET;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
- return 1;
+ /* Write to CR0 reserved bits are ignored, even on Intel. */
+ cr0 &= ~CR0_RESERVED_BITS;
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
@@ -2172,6 +2181,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
u64 data;
fastpath_t ret = EXIT_FASTPATH_NONE;
+ kvm_vcpu_srcu_read_lock(vcpu);
+
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
@@ -2194,6 +2205,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
if (ret != EXIT_FASTPATH_NONE)
trace_kvm_msr_write(msr, data);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
@@ -10203,9 +10216,13 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (r < 0)
goto out;
if (r) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ int irq = kvm_cpu_get_interrupt(vcpu);
+
+ if (!WARN_ON_ONCE(irq == -1)) {
+ kvm_queue_interrupt(vcpu, irq, false);
+ static_call(kvm_x86_inject_irq)(vcpu, false);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ }
}
if (kvm_cpu_has_injectable_intr(vcpu))
static_call(kvm_x86_enable_irq_window)(vcpu);
@@ -11460,7 +11477,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return false;
}
- return kvm_is_valid_cr4(vcpu, sregs->cr4);
+ return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
+ kvm_is_valid_cr0(vcpu, sregs->cr0);
}
static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
@@ -13185,7 +13203,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
bool kvm_arch_has_irq_bypass(void)
{
- return true;
+ return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
diff --git a/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci b/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci
deleted file mode 100644
index 7c312310547c..000000000000
--- a/scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/// Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE
-/// for debugfs files.
-///
-//# Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
-//# imposes some significant overhead as compared to
-//# DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().
-//
-// Copyright (C): 2016 Nicolai Stange
-// Options: --no-includes
-//
-
-virtual context
-virtual patch
-virtual org
-virtual report
-
-@dsa@
-declarer name DEFINE_SIMPLE_ATTRIBUTE;
-identifier dsa_fops;
-expression dsa_get, dsa_set, dsa_fmt;
-position p;
-@@
-DEFINE_SIMPLE_ATTRIBUTE@p(dsa_fops, dsa_get, dsa_set, dsa_fmt);
-
-@dcf@
-expression name, mode, parent, data;
-identifier dsa.dsa_fops;
-@@
-debugfs_create_file(name, mode, parent, data, &dsa_fops)
-
-
-@context_dsa depends on context && dcf@
-declarer name DEFINE_DEBUGFS_ATTRIBUTE;
-identifier dsa.dsa_fops;
-expression dsa.dsa_get, dsa.dsa_set, dsa.dsa_fmt;
-@@
-* DEFINE_SIMPLE_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt);
-
-
-@patch_dcf depends on patch expression@
-expression name, mode, parent, data;
-identifier dsa.dsa_fops;
-@@
-- debugfs_create_file(name, mode, parent, data, &dsa_fops)
-+ debugfs_create_file_unsafe(name, mode, parent, data, &dsa_fops)
-
-@patch_dsa depends on patch_dcf && patch@
-identifier dsa.dsa_fops;
-expression dsa.dsa_get, dsa.dsa_set, dsa.dsa_fmt;
-@@
-- DEFINE_SIMPLE_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt);
-+ DEFINE_DEBUGFS_ATTRIBUTE(dsa_fops, dsa_get, dsa_set, dsa_fmt);
-
-
-@script:python depends on org && dcf@
-fops << dsa.dsa_fops;
-p << dsa.p;
-@@
-msg="%s should be defined with DEFINE_DEBUGFS_ATTRIBUTE" % (fops)
-coccilib.org.print_todo(p[0], msg)
-
-@script:python depends on report && dcf@
-fops << dsa.dsa_fops;
-p << dsa.p;
-@@
-msg="WARNING: %s should be defined with DEFINE_DEBUGFS_ATTRIBUTE" % (fops)
-coccilib.report.print_report(p[0], msg)
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 07732a157ccd..eb1ff597bcca 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -362,8 +362,10 @@ static inline void read_stats_header(int stats_fd, struct kvm_stats_header *head
{
ssize_t ret;
- ret = read(stats_fd, header, sizeof(*header));
- TEST_ASSERT(ret == sizeof(*header), "Read stats header");
+ ret = pread(stats_fd, header, sizeof(*header), 0);
+ TEST_ASSERT(ret == sizeof(*header),
+ "Failed to read '%lu' header bytes, ret = '%ld'",
+ sizeof(*header), ret);
}
struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
index a7001e29dc06..698c1cfa3111 100644
--- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c
+++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c
@@ -43,8 +43,10 @@ static void stats_test(int stats_fd)
id = malloc(header.name_size);
TEST_ASSERT(id, "Allocate memory for id string");
- ret = read(stats_fd, id, header.name_size);
- TEST_ASSERT(ret == header.name_size, "Read id string");
+ ret = pread(stats_fd, id, header.name_size, sizeof(header));
+ TEST_ASSERT(ret == header.name_size,
+ "Expected header size '%u', read '%lu' bytes",
+ header.name_size, ret);
/* Check id string, that should start with "kvm" */
TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header.name_size,
@@ -165,23 +167,7 @@ static void stats_test(int stats_fd)
free(stats_data);
free(stats_desc);
free(id);
-}
-
-
-static void vm_stats_test(struct kvm_vm *vm)
-{
- int stats_fd = vm_get_stats_fd(vm);
-
- stats_test(stats_fd);
- close(stats_fd);
- TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed");
-}
-
-static void vcpu_stats_test(struct kvm_vcpu *vcpu)
-{
- int stats_fd = vcpu_get_stats_fd(vcpu);
- stats_test(stats_fd);
close(stats_fd);
TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed");
}
@@ -199,6 +185,7 @@ static void vcpu_stats_test(struct kvm_vcpu *vcpu)
int main(int argc, char *argv[])
{
+ int vm_stats_fds, *vcpu_stats_fds;
int i, j;
struct kvm_vcpu **vcpus;
struct kvm_vm **vms;
@@ -231,23 +218,58 @@ int main(int argc, char *argv[])
vcpus = malloc(sizeof(struct kvm_vcpu *) * max_vm * max_vcpu);
TEST_ASSERT(vcpus, "Allocate memory for storing vCPU pointers");
+ /*
+ * Not per-VM as the array is populated, used, and invalidated within a
+ * single for-loop iteration.
+ */
+ vcpu_stats_fds = calloc(max_vm, sizeof(*vcpu_stats_fds));
+ TEST_ASSERT(vcpu_stats_fds, "Allocate memory for VM stats fds");
+
for (i = 0; i < max_vm; ++i) {
vms[i] = vm_create_barebones();
for (j = 0; j < max_vcpu; ++j)
vcpus[i * max_vcpu + j] = __vm_vcpu_add(vms[i], j);
}
- /* Check stats read for every VM and VCPU */
+ /*
+ * Check stats read for every VM and vCPU, with a variety of flavors.
+ * Note, stats_test() closes the passed in stats fd.
+ */
for (i = 0; i < max_vm; ++i) {
- vm_stats_test(vms[i]);
+ /*
+ * Verify that creating multiple userspace references to a
+ * single stats file works and doesn't cause explosions.
+ */
+ vm_stats_fds = vm_get_stats_fd(vms[i]);
+ stats_test(dup(vm_stats_fds));
+
+ /* Verify userspace can instantiate multiple stats files. */
+ stats_test(vm_get_stats_fd(vms[i]));
+
+ for (j = 0; j < max_vcpu; ++j) {
+ vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]);
+ stats_test(dup(vcpu_stats_fds[j]));
+ stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j]));
+ }
+
+ /*
+ * Close the VM fd and redo the stats tests. KVM should gift a
+ * reference (to the VM) to each stats fd, i.e. stats should
+ * still be accessible even after userspace has put its last
+ * _direct_ reference to the VM.
+ */
+ kvm_vm_free(vms[i]);
+
+ stats_test(vm_stats_fds);
for (j = 0; j < max_vcpu; ++j)
- vcpu_stats_test(vcpus[i * max_vcpu + j]);
+ stats_test(vcpu_stats_fds[j]);
+
ksft_test_result_pass("vm%i\n", i);
}
- for (i = 0; i < max_vm; ++i)
- kvm_vm_free(vms[i]);
free(vms);
+ free(vcpus);
+ free(vcpu_stats_fds);
ksft_finished(); /* Print results and exit() accordingly */
}
diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
index a284fcef6ed7..3610981d9162 100644
--- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
@@ -22,26 +22,25 @@
#include "kvm_util.h"
#include "processor.h"
-static void test_cr4_feature_bit(struct kvm_vcpu *vcpu, struct kvm_sregs *orig,
- uint64_t feature_bit)
-{
- struct kvm_sregs sregs;
- int rc;
-
- /* Skip the sub-test, the feature is supported. */
- if (orig->cr4 & feature_bit)
- return;
-
- memcpy(&sregs, orig, sizeof(sregs));
- sregs.cr4 |= feature_bit;
-
- rc = _vcpu_sregs_set(vcpu, &sregs);
- TEST_ASSERT(rc, "KVM allowed unsupported CR4 bit (0x%lx)", feature_bit);
-
- /* Sanity check that KVM didn't change anything. */
- vcpu_sregs_get(vcpu, &sregs);
- TEST_ASSERT(!memcmp(&sregs, orig, sizeof(sregs)), "KVM modified sregs");
-}
+#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \
+do { \
+ struct kvm_sregs new; \
+ int rc; \
+ \
+ /* Skip the sub-test, the feature/bit is supported. */ \
+ if (orig.cr & bit) \
+ break; \
+ \
+ memcpy(&new, &orig, sizeof(sregs)); \
+ new.cr |= bit; \
+ \
+ rc = _vcpu_sregs_set(vcpu, &new); \
+ TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \
+ \
+ /* Sanity check that KVM didn't change anything. */ \
+ vcpu_sregs_get(vcpu, &new); \
+ TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \
+} while (0)
static uint64_t calc_supported_cr4_feature_bits(void)
{
@@ -80,7 +79,7 @@ int main(int argc, char *argv[])
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
uint64_t cr4;
- int rc;
+ int rc, i;
/*
* Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and
@@ -92,6 +91,7 @@ int main(int argc, char *argv[])
vcpu_sregs_get(vcpu, &sregs);
+ sregs.cr0 = 0;
sregs.cr4 |= calc_supported_cr4_feature_bits();
cr4 = sregs.cr4;
@@ -103,16 +103,24 @@ int main(int argc, char *argv[])
sregs.cr4, cr4);
/* Verify all unsupported features are rejected by KVM. */
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_UMIP);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_LA57);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_VMXE);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMXE);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_FSGSBASE);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PCIDE);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_OSXSAVE);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMEP);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_SMAP);
- test_cr4_feature_bit(vcpu, &sregs, X86_CR4_PKE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE);
+
+ for (i = 32; i < 64; i++)
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i));
+
+ /* NW without CD is illegal, as is PG without PE. */
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW);
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG);
+
kvm_vm_free(vm);
/* Create a "real" VM and verify APIC_BASE can be set. */
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 4e4aa006004c..a723da253244 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -34,9 +34,17 @@
#include "../kselftest.h"
#include "rseq.h"
-static const ptrdiff_t *libc_rseq_offset_p;
-static const unsigned int *libc_rseq_size_p;
-static const unsigned int *libc_rseq_flags_p;
+/*
+ * Define weak versions to play nice with binaries that are statically linked
+ * against a libc that doesn't support registering its own rseq.
+ */
+__weak ptrdiff_t __rseq_offset;
+__weak unsigned int __rseq_size;
+__weak unsigned int __rseq_flags;
+
+static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
+static const unsigned int *libc_rseq_size_p = &__rseq_size;
+static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
/* Offset from the thread pointer to the rseq area. */
ptrdiff_t rseq_offset;
@@ -155,9 +163,17 @@ unsigned int get_rseq_feature_size(void)
static __attribute__((constructor))
void rseq_init(void)
{
- libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
- libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
- libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
+ /*
+ * If the libc's registered rseq size isn't already valid, it may be
+ * because the binary is dynamically linked and not necessarily due to
+ * libc not having registered a restartable sequence. Try to find the
+ * symbols if that's the case.
+ */
+ if (!*libc_rseq_size_p) {
+ libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
+ libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
+ libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
+ }
if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
*libc_rseq_size_p != 0) {
/* rseq registration owned by glibc */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dfbaafbe3a00..5bbb5612b207 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4035,8 +4035,17 @@ static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
sizeof(vcpu->stat), user_buffer, size, offset);
}
+static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
+{
+ struct kvm_vcpu *vcpu = file->private_data;
+
+ kvm_put_kvm(vcpu->kvm);
+ return 0;
+}
+
static const struct file_operations kvm_vcpu_stats_fops = {
.read = kvm_vcpu_stats_read,
+ .release = kvm_vcpu_stats_release,
.llseek = noop_llseek,
};
@@ -4057,6 +4066,9 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
put_unused_fd(fd);
return PTR_ERR(file);
}
+
+ kvm_get_kvm(vcpu->kvm);
+
file->f_mode |= FMODE_PREAD;
fd_install(fd, file);
@@ -4701,8 +4713,17 @@ static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
sizeof(kvm->stat), user_buffer, size, offset);
}
+static int kvm_vm_stats_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = file->private_data;
+
+ kvm_put_kvm(kvm);
+ return 0;
+}
+
static const struct file_operations kvm_vm_stats_fops = {
.read = kvm_vm_stats_read,
+ .release = kvm_vm_stats_release,
.llseek = noop_llseek,
};
@@ -4721,6 +4742,9 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
put_unused_fd(fd);
return PTR_ERR(file);
}
+
+ kvm_get_kvm(kvm);
+
file->f_mode |= FMODE_PREAD;
fd_install(fd, file);