author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-15 15:00:28 -0700
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-15 15:00:28 -0700
commit | 636deed6c0bc137a7c4f4a97ae1fcf0ad75323da |
tree | 7bd27189b8e30e3c1466f7730831a08db65f8646 /arch |
parent | aa2e3ac64ace127f403be85aa4d6015b859385f2 |
parent | 4a605bc08e98381d8df61c30a4acb2eac15eb7da |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"ARM:
- some cleanups
- direct physical timer assignment
- cache sanitization for 32-bit guests
s390:
- interrupt cleanup
- introduction of the Guest Information Block
- preparation for processor subfunctions in cpu models
PPC:
- bug fixes and improvements, especially related to machine checks
and protection keys
x86:
- many, many cleanups, including removing a bunch of MMU code for
unnecessary optimizations
- AVIC fixes
Generic:
- memcg accounting"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (147 commits)
kvm: vmx: fix formatting of a comment
KVM: doc: Document the life cycle of a VM and its resources
MAINTAINERS: Add KVM selftests to existing KVM entry
Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()"
KVM: PPC: Book3S: Add count cache flush parameters to kvmppc_get_cpu_char()
KVM: PPC: Fix compilation when KVM is not enabled
KVM: Minor cleanups for kvm_main.c
KVM: s390: add debug logging for cpu model subfunctions
KVM: s390: implement subfunction processor calls
arm64: KVM: Fix architecturally invalid reset value for FPEXC32_EL2
KVM: arm/arm64: Remove unused timer variable
KVM: PPC: Book3S: Improve KVM reference counting
KVM: PPC: Book3S HV: Fix build failure without IOMMU support
Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"
x86: kvmguest: use TSC clocksource if invariant TSC is exposed
KVM: Never start grow vCPU halt_poll_ns from value below halt_poll_ns_grow_start
KVM: Expose the initial start value in grow_halt_poll_ns() as a module parameter
KVM: grow_halt_poll_ns() should never shrink vCPU halt_poll_ns
KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes()
KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children
...
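Three of the commits above rework how the adaptive halt-polling window grows: the window is multiplied by halt_poll_ns_grow, and if the result is still below halt_poll_ns_grow_start (now a module parameter), it is bumped up to that start value rather than starting from zero; growing never shrinks the window. A condensed sketch of the resulting grow policy, mirroring the book3s_hv version visible in the diff below (the parameter defaults shown are illustrative, and the per-vCPU state is simplified to a plain pointer):

```c
#include <linux/types.h>

/* Module parameters in the real code; values here are illustrative. */
static unsigned int halt_poll_ns_grow = 2;
static unsigned int halt_poll_ns_grow_start = 10000;	/* 10 us */

static void grow_halt_poll_ns(u64 *halt_poll_ns)
{
	if (!halt_poll_ns_grow)
		return;			/* growing disabled */

	*halt_poll_ns *= halt_poll_ns_grow;
	if (*halt_poll_ns < halt_poll_ns_grow_start)
		*halt_poll_ns = halt_poll_ns_grow_start;
}
```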
Diffstat (limited to 'arch')
69 files changed, 1718 insertions, 825 deletions
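The largest cross-cutting change in the arch/arm and arch/arm64 hunks below is the rework of kvm_call_hyp(): the old variadic function becomes __kvm_call_hyp(), and kvm_call_hyp()/kvm_call_hyp_ret() become macros that call the hyp function directly when the host runs with VHE (followed on arm64 by an isb() to preserve the context synchronization that the exception return used to provide) and fall back to the HVC-based __kvm_call_hyp() otherwise. A readable restatement of the arm64 dispatch pattern, condensed from the diff (has_vhe(), isb(), kvm_ksym_ref() and __kvm_call_hyp() are the kernel's own helpers):

```c
#define kvm_call_hyp(f, ...)						\
	do {								\
		if (has_vhe()) {					\
			f(__VA_ARGS__);					\
			isb();						\
		} else {						\
			__kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__);	\
		}							\
	} while (0)

#define kvm_call_hyp_ret(f, ...)					\
	({								\
		typeof(f(__VA_ARGS__)) ret;				\
									\
		if (has_vhe()) {					\
			ret = f(__VA_ARGS__);				\
			isb();						\
		} else {						\
			ret = __kvm_call_hyp(kvm_ksym_ref(f),		\
					     ##__VA_ARGS__);		\
		}							\
		ret;							\
	})
```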
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index f6f485f4744e..d15b8c99f1b3 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -55,7 +55,7 @@ #define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1) #define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2) #define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3) -#define ICH_ELSR __ACCESS_CP15(c12, 4, c11, 5) +#define ICH_ELRSR __ACCESS_CP15(c12, 4, c11, 5) #define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7) #define __LR0(x) __ACCESS_CP15(c12, 4, c12, x) @@ -152,7 +152,7 @@ CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) -CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) +CPUIF_MAP(ICH_ELRSR, ICH_ELRSR_EL2) CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 77121b713bef..8927cae7c966 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -265,6 +265,14 @@ static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) } } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 50e89869178a..770d73257ad9 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/fpstate.h> +#include <asm/smp_plat.h> #include <kvm/arm_arch_timer.h> #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -57,10 +58,13 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_reset_coprocs(struct kvm_vcpu *vcpu); -struct kvm_arch { - /* VTTBR value associated with below pgd and vmid */ - u64 vttbr; +struct kvm_vmid { + /* The VMID generation used for the virt. memory system */ + u64 vmid_gen; + u32 vmid; +}; +struct kvm_arch { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -70,11 +74,11 @@ struct kvm_arch { */ /* The VMID generation used for the virt. memory system */ - u64 vmid_gen; - u32 vmid; + struct kvm_vmid vmid; /* Stage-2 page table */ pgd_t *pgd; + phys_addr_t pgd_phys; /* Interrupt controller */ struct vgic_dist vgic; @@ -148,6 +152,13 @@ struct kvm_cpu_context { typedef struct kvm_cpu_context kvm_cpu_context_t; +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->cp15[c0_MPIDR] = cpu_logical_map(cpu); +} + struct vcpu_reset_state { unsigned long pc; unsigned long r0; @@ -224,7 +235,35 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); -unsigned long kvm_call_hyp(void *hypfn, ...); + +unsigned long __kvm_call_hyp(void *hypfn, ...); + +/* + * The has_vhe() part doesn't get emitted, but is used for type-checking. + */ +#define kvm_call_hyp(f, ...) 
\ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) \ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) + void force_vm_exit(const cpumask_t *mask); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); @@ -275,7 +314,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * compliant with the PCS!). */ - kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); + __kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } static inline void __cpu_init_stage2(void) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index e93a0cac9add..87bcd18df8d5 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -40,6 +40,7 @@ #define TTBR1 __ACCESS_CP15_64(1, c2) #define VTTBR __ACCESS_CP15_64(6, c2) #define PAR __ACCESS_CP15_64(0, c7) +#define CNTP_CVAL __ACCESS_CP15_64(2, c14) #define CNTV_CVAL __ACCESS_CP15_64(3, c14) #define CNTVOFF __ACCESS_CP15_64(4, c14) @@ -85,6 +86,7 @@ #define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4) #define HTPIDR __ACCESS_CP15(c13, 4, c0, 2) #define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0) +#define CNTP_CTL __ACCESS_CP15(c14, 0, c2, 1) #define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1) #define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0) @@ -94,6 +96,8 @@ #define read_sysreg_el0(r) read_sysreg(r##_el0) #define write_sysreg_el0(v, r) write_sysreg(v, r##_el0) +#define cntp_ctl_el0 CNTP_CTL +#define cntp_cval_el0 CNTP_CVAL #define cntv_ctl_el0 CNTV_CTL #define cntv_cval_el0 CNTV_CVAL #define cntvoff_el2 CNTVOFF diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3a875fc1b63c..2de96a180166 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -421,9 +421,14 @@ static inline int hyp_map_aux_data(void) static inline void kvm_set_ipa_limit(void) {} -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return false; + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 48de846f2246..531e59f5be9c 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -8,9 +8,8 @@ ifeq ($(plus_virt),+virt) plus_virt_def := -DREQUIRES_VIRT=1 endif -ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. $(plus_virt_def) -CFLAGS_mmu.o := -I. 
+ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic +CFLAGS_arm.o := $(plus_virt_def) AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index e8bd288fd5be..14915c78bd99 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -293,15 +293,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu, const struct coproc_params *p, const struct coproc_reg *r) { - u64 now = kvm_phys_timer_read(); - u64 val; + u32 val; if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - *vcpu_reg(vcpu, p->Rt1) = val - now; + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_TVAL); + *vcpu_reg(vcpu, p->Rt1) = val; } return true; @@ -315,9 +316,11 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu, if (p->is_write) { val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CTL); *vcpu_reg(vcpu, p->Rt1) = val; } @@ -333,9 +336,11 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, if (p->is_write) { val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; val |= *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val); + kvm_arm_timer_write_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL, val); } else { - val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + val = kvm_arm_timer_read_sysreg(vcpu, + TIMER_PTIMER, TIMER_REG_CVAL); *vcpu_reg(vcpu, p->Rt1) = val; *vcpu_reg(vcpu, p->Rt2) = val >> 32; } diff --git a/arch/arm/kvm/hyp/cp15-sr.c b/arch/arm/kvm/hyp/cp15-sr.c index c4782812714c..8bf895ec6e04 100644 --- a/arch/arm/kvm/hyp/cp15-sr.c +++ b/arch/arm/kvm/hyp/cp15-sr.c @@ -27,7 +27,6 @@ static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx) void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) { - ctxt->cp15[c0_MPIDR] = read_sysreg(VMPIDR); ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR); ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR); ctxt->cp15[c1_CPACR] = read_sysreg(CPACR); diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index aa3f9a9837ac..6ed3cf23fe89 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -176,7 +176,7 @@ THUMB( orr lr, lr, #PSR_T_BIT ) msr spsr_cxsf, lr ldr lr, =panic msr ELR_hyp, lr - ldr lr, =kvm_call_hyp + ldr lr, =__kvm_call_hyp clrex eret ENDPROC(__hyp_do_panic) diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index acf1c37fa49c..3b058a5d7c5f 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -77,7 +77,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); write_sysreg(vcpu->arch.midr, VPIDR); } diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index c0edd450e104..8e4afba73635 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -41,7 +41,7 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - 
write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALLIS); @@ -61,7 +61,7 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); /* Switch to requested VMID */ - write_sysreg(kvm->arch.vttbr, VTTBR); + write_sysreg(kvm_get_vttbr(kvm), VTTBR); isb(); write_sysreg(0, TLBIALL); diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 80a1d6cd261c..a08e6419ebe9 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -42,7 +42,7 @@ * r12: caller save * rest: callee save */ -ENTRY(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 bx lr -ENDPROC(kvm_call_hyp) +ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 506386a3edde..d3842791e1c4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -77,6 +77,10 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) */ if (!vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 |= HCR_TID3; + + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) || + vcpu_el1_is_32bit(vcpu)) + vcpu->arch.hcr_el2 |= HCR_TID2; } static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) @@ -331,6 +335,14 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) return ESR_ELx_SYS64_ISS_RT(esr); } +static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 222af1d2c3e4..a01fe087e022 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> +#include <asm/smp_plat.h> #include <asm/thread_info.h> #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -58,16 +59,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); -struct kvm_arch { +struct kvm_vmid { /* The VMID generation used for the virt. memory system */ u64 vmid_gen; u32 vmid; +}; + +struct kvm_arch { + struct kvm_vmid vmid; /* stage2 entry level table */ pgd_t *pgd; + phys_addr_t pgd_phys; - /* VTTBR value associated with above pgd and vmid */ - u64 vttbr; /* VTCR_EL2 value for this VM */ u64 vtcr; @@ -382,7 +386,36 @@ void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); u64 __kvm_call_hyp(void *hypfn, ...); -#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) + +/* + * The couple of isb() below are there to guarantee the same behaviour + * on VHE as on !VHE, where the eret to EL1 acts as a context + * synchronization event. + */ +#define kvm_call_hyp(f, ...) \ + do { \ + if (has_vhe()) { \ + f(__VA_ARGS__); \ + isb(); \ + } else { \ + __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ + } \ + } while(0) + +#define kvm_call_hyp_ret(f, ...) 
\ + ({ \ + typeof(f(__VA_ARGS__)) ret; \ + \ + if (has_vhe()) { \ + ret = f(__VA_ARGS__); \ + isb(); \ + } else { \ + ret = __kvm_call_hyp(kvm_ksym_ref(f), \ + ##__VA_ARGS__); \ + } \ + \ + ret; \ + }) void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); @@ -401,6 +434,13 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); +static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, + int cpu) +{ + /* The host's MPIDR is immutable, so let's set it up at boot time */ + cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu); +} + void __kvm_enable_ssbs(void); static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index a80a7ef57325..4da765f2cca5 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -21,6 +21,7 @@ #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/alternative.h> +#include <asm/kvm_mmu.h> #include <asm/sysreg.h> #define __hyp_text __section(.hyp.text) notrace @@ -163,7 +164,7 @@ void __noreturn __hyp_do_panic(unsigned long, ...); static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) { write_sysreg(kvm->arch.vtcr, vtcr_el2); - write_sysreg(kvm->arch.vttbr, vttbr_el2); + write_sysreg(kvm_get_vttbr(kvm), vttbr_el2); /* * ARM erratum 1165522 requires the actual execution of the above diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8af4b1befa42..b0742a16c6c9 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -138,7 +138,8 @@ static inline unsigned long __kern_hyp_va(unsigned long v) }) /* - * We currently only support a 40bit IPA. + * We currently support using a VM-specified IPA size. For backward + * compatibility, the default IPA size is fixed to 40bits. */ #define KVM_PHYS_SHIFT (40) @@ -591,9 +592,15 @@ static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); } -static inline bool kvm_cpu_has_cnp(void) +static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) { - return system_supports_cnp(); + struct kvm_vmid *vmid = &kvm->arch.vmid; + u64 vmid_field, baddr; + u64 cnp = system_supports_cnp() ? 
VTTBR_CNP_BIT : 0; + + baddr = kvm->arch.pgd_phys; + vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 72dc4c011014..5b267dec6194 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -361,6 +361,7 @@ #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) +#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) @@ -392,6 +393,10 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) +#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) +#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) + #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) #define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n)) @@ -426,7 +431,7 @@ #define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) #define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) -#define SYS_ICH_ELSR_EL2 sys_reg(3, 4, 12, 11, 5) +#define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) #define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 0f2a135ba15b..690e033a91c0 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -3,9 +3,7 @@ # Makefile for Kernel-based Virtual Machine module # -ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic -CFLAGS_arm.o := -I. -CFLAGS_mmu.o := -I. +ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic KVM=../../../virt/kvm diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index f39801e4136c..fd917d6d12af 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -76,7 +76,7 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) void kvm_arm_init_debug(void) { - __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); + __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); } /** diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 952f6cb9cf72..2845aa680841 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -40,9 +40,6 @@ * arch/arm64/kernel/hyp_stub.S. */ ENTRY(__kvm_call_hyp) -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN hvc #0 ret -alternative_else_nop_endif - b __vhe_hyp_call ENDPROC(__kvm_call_hyp) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 73c1b483ec39..2b1e686772bf 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -43,18 +43,6 @@ ldr lr, [sp], #16 .endm -ENTRY(__vhe_hyp_call) - do_el2_call - /* - * We used to rely on having an exception return to get - * an implicit isb. In the E2H case, we don't have it anymore. - * rather than changing all the leaf functions, just do it here - * before returning to the rest of the kernel. 
- */ - isb - ret -ENDPROC(__vhe_hyp_call) - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index b426e2cf973c..c52a8451637c 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -53,7 +53,6 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { - ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2); ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c936aa40c3f4..539feecda5b8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -982,6 +982,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2); + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -1003,44 +1007,38 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } -static bool access_cntp_tval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) +static bool access_arch_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) { - u64 now = kvm_phys_timer_read(); - u64 cval; + enum kvm_arch_timers tmr; + enum kvm_arch_timer_regs treg; + u64 reg = reg_to_encoding(r); - if (p->is_write) { - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, - p->regval + now); - } else { - cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); - p->regval = cval - now; + switch (reg) { + case SYS_CNTP_TVAL_EL0: + case SYS_AARCH32_CNTP_TVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + case SYS_AARCH32_CNTP_CTL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + case SYS_AARCH32_CNTP_CVAL: + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + default: + BUG(); } - return true; -} - -static bool access_cntp_ctl(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval); - else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); - - return true; -} - -static bool access_cntp_cval(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ if (p->is_write) - kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); + kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval); else - p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg); return true; } @@ -1160,6 +1158,64 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(rd, uaddr, true); } +static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0); + return true; +} + +static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const 
struct sys_reg_desc *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + p->regval = read_sysreg(clidr_el1); + return true; +} + +static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + vcpu_write_sys_reg(vcpu, p->regval, r->reg); + else + p->regval = vcpu_read_sys_reg(vcpu, r->reg); + return true; +} + +static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 csselr; + + if (p->is_write) + return write_to_read_only(vcpu, p, r); + + csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1); + p->regval = get_ccsidr(csselr); + + /* + * Guests should not be doing cache operations by set/way at all, and + * for this reason, we trap them and attempt to infer the intent, so + * that we can flush the entire guest's address space at the appropriate + * time. + * To prevent this trapping from causing performance problems, let's + * expose the geometry of all data and unified caches (which are + * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way. + * [If guests should attempt to infer aliasing properties from the + * geometry (which is not permitted by the architecture), they would + * only do so for virtually indexed caches.] + */ + if (!(csselr & 1)) // data or unified cache + p->regval &= ~GENMASK(27, 3); + return true; +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1377,7 +1433,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, - { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr }, + { SYS_DESC(SYS_CLIDR_EL1), access_clidr }, + { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, + { SYS_DESC(SYS_CTR_EL0), access_ctr }, { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, @@ -1400,9 +1459,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, - { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, - { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, - { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -1476,7 +1535,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, - { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, + { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, }; static bool trap_dbgidr(struct kvm_vcpu *vcpu, @@ -1677,6 +1736,7 @@ static const struct sys_reg_desc cp14_64_regs[] = { * register). 
*/ static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr }, { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, @@ -1723,10 +1783,9 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, - /* CNTP_TVAL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval }, - /* CNTP_CTL */ - { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl }, + /* Arch Tmers */ + { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer }, /* PMEVCNTRn */ PMU_PMEVCNTR(0), @@ -1794,6 +1853,10 @@ static const struct sys_reg_desc cp15_regs[] = { PMU_PMEVTYPER(30), /* PMCCFILTR */ { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, + + { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr }, + { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr }, + { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR }, }; static const struct sys_reg_desc cp15_64_regs[] = { @@ -1803,7 +1866,7 @@ static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ - { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval }, + { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, }; /* Target specific emulation tables */ @@ -1832,30 +1895,19 @@ static const struct sys_reg_desc *get_target_table(unsigned target, } } -#define reg_to_match_value(x) \ - ({ \ - unsigned long val; \ - val = (x)->Op0 << 14; \ - val |= (x)->Op1 << 11; \ - val |= (x)->CRn << 7; \ - val |= (x)->CRm << 3; \ - val |= (x)->Op2; \ - val; \ - }) - static int match_sys_reg(const void *key, const void *elt) { const unsigned long pval = (unsigned long)key; const struct sys_reg_desc *r = elt; - return pval - reg_to_match_value(r); + return pval - reg_to_encoding(r); } static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], unsigned int num) { - unsigned long pval = reg_to_match_value(params); + unsigned long pval = reg_to_encoding(params); return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } @@ -2218,11 +2270,15 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, } FUNCTION_INVARIANT(midr_el1) -FUNCTION_INVARIANT(ctr_el0) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(clidr_el1) FUNCTION_INVARIANT(aidr_el1) +static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +{ + ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); +} + /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d2abd98471e8..41204a49cf95 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, 
struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..e6b5bb012ccb 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -99,6 +99,8 @@ struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; + ulong num_2M_pages; + ulong num_1G_pages; }; struct kvm_vcpu_stat { @@ -377,6 +379,7 @@ struct kvmppc_mmu { void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs); u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr); u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr); + int (*slbfee)(struct kvm_vcpu *vcpu, gva_t eaddr, ulong *ret_slb); void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr); void (*slbia)(struct kvm_vcpu *vcpu); /* book3s */ @@ -837,7 +840,7 @@ struct kvm_vcpu_arch { static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a6c8548ed9fa..ac22b28ae78d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -36,6 +36,8 @@ #endif #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/paca.h> +#include <asm/xive.h> +#include <asm/cpu_has_feature.h> #endif /* @@ -617,6 +619,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ +#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) +static inline bool xics_on_xive(void) +{ + return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool xics_on_xive(void) +{ + return false; +} +#endif + /* * Prototypes for functions called only from assembler code. * Having prototypes reduces sparse errors. 
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 8c876c166ef2..26ca425f4c2c 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -463,10 +463,12 @@ struct kvm_ppc_cpu_char { #define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58) #define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57) #define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56) +#define KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST (1ull << 54) #define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63) #define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62) #define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61) +#define KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58) /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9a7dadbe1f17..10c5579d20ce 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -39,6 +39,7 @@ #include "book3s.h" #include "trace.h" +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU /* #define EXIT_DEBUG */ @@ -71,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, + { "largepages_2M", VM_STAT(num_2M_pages) }, + { "largepages_1G", VM_STAT(num_1G_pages) }, { NULL } }; @@ -642,7 +645,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); else *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); @@ -715,7 +718,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, r = -ENXIO; break; } - if (xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); else r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); @@ -991,7 +994,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { - if (xive_enabled()) + if (xics_on_xive()) return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, line_status); else @@ -1044,7 +1047,7 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XIVE - if (xive_enabled()) { + if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); } else @@ -1057,7 +1060,7 @@ static int kvmppc_book3s_init(void) static void kvmppc_book3s_exit(void) { #ifdef CONFIG_KVM_XICS - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_exit_module(); #endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 612169988a3d..6f789f674048 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -425,6 +425,7 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) mmu->slbmte = NULL; mmu->slbmfee = NULL; mmu->slbmfev = NULL; + mmu->slbfee = NULL; mmu->slbie = NULL; mmu->slbia = NULL; } diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index c92dd25bed23..d4b967f0e8d4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -435,6 +435,19 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); } +static int 
kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr, + ulong *ret_slb) +{ + struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); + + if (slbe) { + *ret_slb = slbe->origv; + return 0; + } + *ret_slb = 0; + return -ENOENT; +} + static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) { struct kvmppc_slb *slbe; @@ -670,6 +683,7 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; + mmu->slbfee = kvmppc_mmu_book3s_64_slbfee; mmu->slbie = kvmppc_mmu_book3s_64_slbie; mmu->slbia = kvmppc_mmu_book3s_64_slbia; mmu->xlate = kvmppc_mmu_book3s_64_xlate; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bd2dcfbf00cd..be7bc070eae5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -442,6 +442,24 @@ int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, u32 last_inst; /* + * Fast path - check if the guest physical address corresponds to a + * device on the FAST_MMIO_BUS, if so we can avoid loading the + * instruction all together, then we can just handle it and return. + */ + if (is_store) { + int idx, ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0, + NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + if (!ret) { + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + return RESUME_GUEST; + } + } + + /* * If we fail, we just return to the guest and try executing it again. */ if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 1b821c6efdef..f55ef071883f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -403,8 +403,13 @@ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, if (!memslot) return; } - if (shift) + if (shift) { /* 1GB or 2MB page */ page_size = 1ul << shift; + if (shift == PMD_SHIFT) + kvm->stat.num_2M_pages--; + else if (shift == PUD_SHIFT) + kvm->stat.num_1G_pages--; + } gpa &= ~(page_size - 1); hpa = old & PTE_RPN_MASK; @@ -878,6 +883,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, put_page(page); } + /* Increment number of large pages if we (successfully) inserted one */ + if (!ret) { + if (level == 1) + kvm->stat.num_2M_pages++; + else if (level == 2) + kvm->stat.num_1G_pages++; + } + return ret; } diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 532ab79734c7..f02b04973710 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -133,7 +133,6 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, continue; kref_put(&stit->kref, kvm_spapr_tce_liobn_put); - return; } } } @@ -338,14 +337,15 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } } + kvm_get_kvm(kvm); if (!ret) ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, stt, O_RDWR | O_CLOEXEC); - if (ret >= 0) { + if (ret >= 0) list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); - kvm_get_kvm(kvm); - } + else + kvm_put_kvm(kvm); mutex_unlock(&kvm->lock); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8c7e933e942e..6ef7c5f00a49 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -47,6 +47,7 @@ #define OP_31_XOP_SLBMFEV 851 #define 
OP_31_XOP_EIOIO 854 #define OP_31_XOP_SLBMFEE 915 +#define OP_31_XOP_SLBFEE 979 #define OP_31_XOP_TBEGIN 654 #define OP_31_XOP_TABORT 910 @@ -416,6 +417,23 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.slbia(vcpu); break; + case OP_31_XOP_SLBFEE: + if (!(inst & 1) || !vcpu->arch.mmu.slbfee) { + return EMULATE_FAIL; + } else { + ulong b, t; + ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK; + + b = kvmppc_get_gpr(vcpu, rb); + if (!vcpu->arch.mmu.slbfee(vcpu, b, &t)) + cr |= 2 << CR0_SHIFT; + kvmppc_set_gpr(vcpu, rt, t); + /* copy XER[SO] bit to CR0[SO] */ + cr |= (vcpu->arch.regs.xer & 0x80000000) >> + (31 - CR0_SHIFT); + kvmppc_set_cr(vcpu, cr); + } + break; case OP_31_XOP_SLBMFEE: if (!vcpu->arch.mmu.slbmfee) { emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a3d5318f5d1e..06964350b97a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { - if (xive_enabled()) { + if (xics_on_xive()) { ret = H_NOT_AVAILABLE; return RESUME_GUEST; } @@ -937,6 +937,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); break; +#ifdef CONFIG_SPAPR_TCE_IOMMU case H_GET_TCE: ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5)); @@ -966,6 +967,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; +#endif case H_RANDOM: if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; @@ -1445,7 +1447,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; - if (!xive_enabled()) + if (!xics_on_xive()) kvmppc_xics_rm_complete(vcpu, 0); break; default: @@ -3648,11 +3650,12 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, static void grow_halt_poll_ns(struct kvmppc_vcore *vc) { - /* 10us base */ - if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) - vc->halt_poll_ns = 10000; - else - vc->halt_poll_ns *= halt_poll_ns_grow; + if (!halt_poll_ns_grow) + return; + + vc->halt_poll_ns *= halt_poll_ns_grow; + if (vc->halt_poll_ns < halt_poll_ns_grow_start) + vc->halt_poll_ns = halt_poll_ns_grow_start; } static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) @@ -3666,7 +3669,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) #ifdef CONFIG_KVM_XICS static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { - if (!xive_enabled()) + if (!xics_on_xive()) return false; return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; @@ -4226,7 +4229,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { - if (WARN_ON(xive_enabled())) + if (WARN_ON(xics_on_xive())) r = H_SUCCESS; else r = kvmppc_xics_rm_complete(vcpu, 0); @@ -4750,7 +4753,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * If xive is enabled, we route 0x500 interrupts directly * to the guest. 
*/ - if (xive_enabled()) + if (xics_on_xive()) lpcr |= LPCR_LPES; } @@ -4986,7 +4989,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); else kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); @@ -5027,7 +5030,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); @@ -5359,13 +5362,11 @@ static int kvm_init_subcore_bitmap(void) continue; sibling_subcore_state = - kmalloc_node(sizeof(struct sibling_subcore_state), + kzalloc_node(sizeof(struct sibling_subcore_state), GFP_KERNEL, node); if (!sibling_subcore_state) return -ENOMEM; - memset(sibling_subcore_state, 0, - sizeof(struct sibling_subcore_state)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; @@ -5406,7 +5407,7 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!xive_enabled() && !kvmhv_on_pseries() && + if (!xics_on_xive() && !kvmhv_on_pseries() && !local_paca->kvm_hstate.xics_phys) { struct device_node *np; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a71e2fc00a4e..b0cf22477e87 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu) } /* We should never reach this */ - if (WARN_ON_ONCE(xive_enabled())) + if (WARN_ON_ONCE(xics_on_xive())) return; /* Else poke the target with an IPI */ @@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; vcpu->arch.regs.gpr[5] = get_tb(); - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_xirr(vcpu); if (unlikely(!__xive_vm_h_xirr)) @@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); if (unlikely(!__xive_vm_h_ipoll)) @@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); if (unlikely(!__xive_vm_h_ipi)) @@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); if (unlikely(!__xive_vm_h_cppr)) @@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { if (!kvmppc_xics_enabled(vcpu)) return H_TOO_HARD; - if (xive_enabled()) { + if (xics_on_xive()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); if (unlikely(!__xive_vm_h_eoi)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index b3f5786b20dc..3b9662a4207e 100644 --- 
a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } + if (xive_enabled() && kvmhv_on_pseries()) { + /* No XICS access or hypercalls available, too hard */ + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* * Check if the core is loaded, * if not, find an available host core to post to wake the VCPU, diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 25043b50cb30..3a5e719ef032 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2272,8 +2272,13 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table +#else + .long 0 /* 0x1c */ + .long 0 /* 0x20 */ +#endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ @@ -2351,8 +2356,13 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table +#ifdef CONFIG_SPAPR_TCE_IOMMU .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table +#else + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ +#endif .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 2d3b2b1cc272..4e178c4c1ea5 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) server = be32_to_cpu(args->args[1]); priority = be32_to_cpu(args->args[2]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); else rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); @@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); server = priority = 0; - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); else rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); @@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_off(vcpu->kvm, irq); else rc = kvmppc_xics_int_off(vcpu->kvm, irq); @@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - if (xive_enabled()) + if (xics_on_xive()) rc = kvmppc_xive_int_on(vcpu->kvm, irq); else rc = kvmppc_xics_int_on(vcpu->kvm, irq); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b90a7d154180..8885377ec3e0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; case KVMPPC_IRQ_XICS: - if (xive_enabled()) + if (xics_on_xive()) kvmppc_xive_cleanup_vcpu(vcpu); else kvmppc_xics_free_icp(vcpu); @@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EPERM; dev = kvm_device_from_filp(f.file); if (dev) { - if 
(xive_enabled()) + if (xics_on_xive()) r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); else r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); @@ -2189,10 +2189,12 @@ static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; } return 0; } @@ -2251,12 +2253,16 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (have_fw_feat(fw_features, "enabled", "fw-count-cache-disabled")) cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + if (have_fw_feat(fw_features, "enabled", + "fw-count-cache-flush-bcctr2,0,0")) + cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | - KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS | + KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST; if (have_fw_feat(fw_features, "enabled", "speculation-policy-favor-security")) @@ -2267,9 +2273,13 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) if (!have_fw_feat(fw_features, "disabled", "needs-spec-barrier-for-bound-checks")) cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + if (have_fw_feat(fw_features, "enabled", + "needs-count-cache-flush-on-context-switch")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | - KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR | + KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE; of_node_put(fw_features); } diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 225667652069..1727180e8ca1 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -331,5 +331,6 @@ extern void css_schedule_reprobe(void); /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); +int chsc_sgib(u32 origin); #endif diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 2f7f27e5493f..afaf5e3c57fd 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -62,6 +62,7 @@ enum interruption_class { IRQIO_MSI, IRQIO_VIR, IRQIO_VAI, + IRQIO_GAL, NMI_NMI, CPU_RST, NR_ARCH_IRQS diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h index 6cb9e2ed05b6..b2cc1ec78d06 100644 --- a/arch/s390/include/asm/isc.h +++ b/arch/s390/include/asm/isc.h @@ -21,6 +21,7 @@ /* Adapter interrupts. 
*/ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ #define PCI_ISC 2 /* PCI I/O subchannels */ +#define GAL_ISC 5 /* GIB alert */ #define AP_ISC 6 /* adjunct processor (crypto) devices */ /* Functions for registration of I/O interruption subclasses */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d5d24889c3bc..c47e22bba87f 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -591,7 +591,6 @@ struct kvm_s390_float_interrupt { struct kvm_s390_mchk_info mchk; struct kvm_s390_ext_info srv_signal; int next_rr_cpu; - unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; struct mutex ais_lock; u8 simm; u8 nimm; @@ -712,6 +711,7 @@ struct s390_io_adapter { struct kvm_s390_cpu_model { /* facility mask supported by kvm & hosting machine */ __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; + struct kvm_s390_vm_cpu_subfunc subfuncs; /* facility list requested by guest (in dma page) */ __u64 *fac_list; u64 cpuid; @@ -782,9 +782,21 @@ struct kvm_s390_gisa { u8 reserved03[11]; u32 airq_count; } g1; + struct { + u64 word[4]; + } u64; }; }; +struct kvm_s390_gib { + u32 alert_list_origin; + u32 reserved01; + u8:5; + u8 nisc:3; + u8 reserved03[3]; + u32 reserved04[5]; +}; + /* * sie_page2 has to be allocated as DMA because fac_list, crycb and * gisa need 31bit addresses in the sie control block. @@ -793,7 +805,8 @@ struct sie_page2 { __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ struct kvm_s390_crypto_cb crycb; /* 0x0800 */ struct kvm_s390_gisa gisa; /* 0x0900 */ - u8 reserved920[0x1000 - 0x920]; /* 0x0920 */ + struct kvm *kvm; /* 0x0920 */ + u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; struct kvm_s390_vsie { @@ -804,6 +817,20 @@ struct kvm_s390_vsie { struct page *pages[KVM_MAX_VCPUS]; }; +struct kvm_s390_gisa_iam { + u8 mask; + spinlock_t ref_lock; + u32 ref_count[MAX_ISC + 1]; +}; + +struct kvm_s390_gisa_interrupt { + struct kvm_s390_gisa *origin; + struct kvm_s390_gisa_iam alert; + struct hrtimer timer; + u64 expires; + DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); +}; + struct kvm_arch{ void *sca; int use_esca; @@ -837,7 +864,8 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); - struct kvm_s390_gisa *gisa; + DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); + struct kvm_s390_gisa_interrupt gisa_int; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -871,6 +899,9 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; +extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); +extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); + static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_check_processor_compat(void *rtn) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} @@ -878,7 +909,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} -static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 
{} diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0e8d68bac82c..0cd5a5f96729 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -88,6 +88,7 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fcb55b02990e..82162867f378 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,6 +7,9 @@ * Author(s): Carsten Otte <cotte@de.ibm.com> */ +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/hrtimer.h> @@ -23,6 +26,7 @@ #include <asm/gmap.h> #include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/airq.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -31,6 +35,8 @@ #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 +static struct kvm_s390_gib *gib; + /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { @@ -217,22 +223,100 @@ static inline u8 int_word_to_isc(u32 int_word) */ #define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) -static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +/** + * gisa_set_iam - change the GISA interruption alert mask + * + * @gisa: gisa to operate on + * @iam: new IAM value to use + * + * Change the IAM atomically with the next alert address and the IPM + * of the GISA if the GISA is not part of the GIB alert list. All three + * fields are located in the first long word of the GISA. + * + * Returns: 0 on success + * -EBUSY in case the gisa is part of the alert list + */ +static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + if ((u64)gisa != word >> 32) + return -EBUSY; + _word = (word & ~0xffUL) | iam; + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + + return 0; +} + +/** + * gisa_clear_ipm - clear the GISA interruption pending mask + * + * @gisa: gisa to operate on + * + * Clear the IPM atomically with the next alert address and the IAM + * of the GISA unconditionally. All three fields are located in the + * first long word of the GISA. + */ +static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) +{ + u64 word, _word; + + do { + word = READ_ONCE(gisa->u64.word[0]); + _word = word & ~(0xffUL << 24); + } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); +} + +/** + * gisa_get_ipm_or_restore_iam - return IPM or restore GISA IAM + * + * @gi: gisa interrupt struct to work on + * + * Atomically restores the interruption alert mask if none of the + * relevant ISCs are pending and return the IPM. 
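The gisa_set_iam()/gisa_clear_ipm() helpers above rely on the next-alert origin, the IPM and the IAM sharing the first 64-bit word of the GISA, so a single compare-and-swap can install a new IAM while verifying that the GISA is not queued on the GIB alert list. A user-space sketch of that update rule with C11 atomics; the placement of the fields inside the word is inferred from the checks in the hunk and is otherwise an assumption of this model:

/* Model of gisa_set_iam(): refuse the update (-EBUSY) while the GISA is on
 * the alert list, i.e. while next_alert no longer points back to itself. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static int model_set_iam(_Atomic uint64_t *word0, uint64_t self, uint8_t iam)
{
	uint64_t old = atomic_load(word0), new;

	do {
		if ((old >> 32) != self)        /* queued on the alert list */
			return -1;              /* -EBUSY in the kernel     */
		new = (old & ~0xffULL) | iam;   /* replace the IAM byte     */
	} while (!atomic_compare_exchange_weak(word0, &old, new));

	return 0;
}

int main(void)
{
	uint64_t self = 0x2000;                 /* pretend GISA address    */
	_Atomic uint64_t word0 = self << 32;    /* next_alert == self      */

	printf("set_iam: %d, word0=%#llx\n", model_set_iam(&word0, self, 0xc0),
	       (unsigned long long)atomic_load(&word0));
	return 0;
}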
+ * + * Returns: the relevant pending ISCs + */ +static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) +{ + u8 pending_mask, alert_mask; + u64 word, _word; + + do { + word = READ_ONCE(gi->origin->u64.word[0]); + alert_mask = READ_ONCE(gi->alert.mask); + pending_mask = (u8)(word >> 24) & alert_mask; + if (pending_mask) + return pending_mask; + _word = (word & ~0xffUL) | alert_mask; + } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + + return 0; +} + +static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) +{ + return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; +} + +static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa) +static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) { return READ_ONCE(gisa->ipm); } -static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) +static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } @@ -245,8 +329,13 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) { - return pending_irqs_no_gisa(vcpu) | - kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; + unsigned long pending_mask; + + pending_mask = pending_irqs_no_gisa(vcpu); + if (gi->origin) + pending_mask |= gisa_get_ipm(gi->origin) << IRQ_PEND_IO_ISC_7; + return pending_mask; } static inline int isc_to_irq_type(unsigned long isc) @@ -318,13 +407,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -345,7 +434,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) return; - else if (psw_ioint_disabled(vcpu)) + if (psw_ioint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); else vcpu->arch.sie_block->lctl |= LCTL_CR6; @@ -353,7 +442,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_EXT_MASK)) return; if (psw_extint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); @@ -363,7 +452,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_MCHK_MASK)) return; if (psw_mchk_disabled(vcpu)) vcpu->arch.sie_block->ictl |= ICTL_LPSW; @@ -956,6 +1045,7 @@ 
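The renamed gisa_*_ipm_gisc() helpers use the s390 *_inv bit operations, which number bits from the most significant end, so ISC 0 maps to the top bit of the pending mask. A non-atomic sketch of that numbering and of the test-and-clear used when an ISC is consumed:

/* MSB-0 bit numbering on a plain byte; no atomicity, illustration only. */
#include <stdint.h>
#include <stdio.h>

static inline uint8_t isc_bit(unsigned int isc)          /* isc in 0..7 */
{
	return 0x80 >> isc;
}

static int tac_ipm_gisc(uint8_t *ipm, unsigned int isc)  /* test and clear */
{
	int was_set = !!(*ipm & isc_bit(isc));

	*ipm &= ~isc_bit(isc);
	return was_set;
}

int main(void)
{
	uint8_t ipm = 0;

	ipm |= isc_bit(3);                                  /* "inject" ISC 3 */
	printf("ISC 3 pending: %d\n", tac_ipm_gisc(&ipm, 3));   /* 1 */
	printf("ISC 3 pending: %d\n", tac_ipm_gisc(&ipm, 3));   /* 0 */
	return 0;
}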
static int __must_check __deliver_io(struct kvm_vcpu *vcpu, { struct list_head *isc_list; struct kvm_s390_float_interrupt *fi; + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti = NULL; struct kvm_s390_io_info io; u32 isc; @@ -998,8 +1088,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, goto out; } - if (vcpu->kvm->arch.gisa && - kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) { + if (gi->origin && gisa_tac_ipm_gisc(gi->origin, isc)) { /* * in case an adapter interrupt was not delivered * in SIE context KVM will handle the delivery @@ -1089,6 +1178,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) { + struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int; u64 sltime; vcpu->stat.exit_wait_state++; @@ -1102,6 +1192,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; /* disabled wait */ } + if (gi->origin && + (gisa_get_ipm_or_restore_iam(gi) & + vcpu->arch.sie_block->gcr[6] >> 24)) + return 0; + if (!ckc_interrupts_enabled(vcpu) && !cpu_timer_interrupts_enabled(vcpu)) { VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); @@ -1533,18 +1628,19 @@ static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm, static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; unsigned long active_mask; int isc; if (schid) goto out; - if (!kvm->arch.gisa) + if (!gi->origin) goto out; - active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32; + active_mask = (isc_mask & gisa_get_ipm(gi->origin) << 24) << 32; while (active_mask) { isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) + if (gisa_tac_ipm_gisc(gi->origin, isc)) return isc; clear_bit_inv(isc, &active_mask); } @@ -1567,6 +1663,7 @@ out: struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 isc_mask, u32 schid) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti, *tmp_inti; int isc; @@ -1584,7 +1681,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, /* both types of interrupts present */ if (int_word_to_isc(inti->io.io_int_word) <= isc) { /* classical IO int with higher priority */ - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); goto out; } gisa_out: @@ -1596,7 +1693,7 @@ gisa_out: kvm_s390_reinject_io_int(kvm, inti); inti = tmp_inti; } else - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); out: return inti; } @@ -1685,6 +1782,7 @@ static int __inject_float_mchk(struct kvm *kvm, static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_float_interrupt *fi; struct list_head *list; int isc; @@ -1692,9 +1790,9 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); - if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { + if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); - kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); + gisa_set_ipm_gisc(gi->origin, isc); kfree(inti); return 0; } @@ -1726,7 +1824,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) */ static void 
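get_top_gisa_isc() above picks the pending ISC with the highest priority, which on s390 is the lowest-numbered one and therefore the most significant set bit of ipm & isc_mask. A simplified stand-in for the __fls() arithmetic in that helper:

/* Return the highest-priority pending ISC, or -1 if none is deliverable. */
#include <stdint.h>
#include <stdio.h>

static int top_pending_isc(uint8_t ipm, uint8_t isc_mask)
{
	uint8_t active = ipm & isc_mask;
	int isc;

	if (!active)
		return -1;
	for (isc = 0; isc < 8; isc++)          /* scan from ISC 0 downwards */
		if (active & (0x80 >> isc))
			return isc;
	return -1;                             /* not reached */
}

int main(void)
{
	/* ISC 2 and ISC 5 pending, guest accepts all ISCs */
	printf("top ISC: %d\n", top_pending_isc(0x24, 0xff));   /* prints 2 */
	return 0;
}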
__floating_irq_kick(struct kvm *kvm, u64 type) { - struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; struct kvm_vcpu *dst_vcpu; int sigcpu, online_vcpus, nr_tries = 0; @@ -1735,11 +1832,11 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) return; /* find idle VCPUs first, then round robin */ - sigcpu = find_first_bit(fi->idle_mask, online_vcpus); + sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); if (sigcpu == online_vcpus) { do { - sigcpu = fi->next_rr_cpu; - fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus; + sigcpu = kvm->arch.float_int.next_rr_cpu++; + kvm->arch.float_int.next_rr_cpu %= online_vcpus; /* avoid endless loops if all vcpus are stopped */ if (nr_tries++ >= online_vcpus) return; @@ -1753,7 +1850,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa)) + if (!(type & KVM_S390_INT_IO_AI_MASK && + kvm->arch.gisa_int.origin)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -2003,6 +2101,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) { + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_s390_interrupt_info *inti; struct kvm_s390_float_interrupt *fi; struct kvm_s390_irq *buf; @@ -2026,15 +2125,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) max_irqs = len / sizeof(struct kvm_s390_irq); - if (kvm->arch.gisa && - kvm_s390_gisa_get_ipm(kvm->arch.gisa)) { + if (gi->origin && gisa_get_ipm(gi->origin)) { for (i = 0; i <= MAX_ISC; i++) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; goto out_nolock; } - if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { + if (gisa_tac_ipm_gisc(gi->origin, i)) { irq = (struct kvm_s390_irq *) &buf[n]; irq->type = KVM_S390_INT_IO(1, 0, 0, 0); irq->u.io.io_int_word = isc_to_int_word(i); @@ -2831,7 +2929,7 @@ static void store_local_irq(struct kvm_s390_local_interrupt *li, int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) { int scn; - unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; unsigned long pending_irqs; struct kvm_s390_irq irq; @@ -2884,27 +2982,278 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) return n; } -void kvm_s390_gisa_clear(struct kvm *kvm) +static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - if (kvm->arch.gisa) { - memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); - kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); + int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + + for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_id); + if (psw_ioint_disabled(vcpu)) + continue; + deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask) { + /* lately kicked but not yet running */ + if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + return; + kvm_s390_vcpu_wakeup(vcpu); + return; + } } } +static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) +{ + struct kvm_s390_gisa_interrupt *gi = + container_of(timer, 
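__floating_irq_kick() now chooses its target from the VM-wide idle_mask first and only falls back to the round-robin cursor when no vCPU is waiting. A sketch of that selection policy, with plain variables in place of the kernel bitmap and locking:

/* Idle-first, then round-robin selection over the online vCPUs. */
#include <stdio.h>

#define MAX_VCPUS 8

static int pick_target(const unsigned char idle_mask[MAX_VCPUS],
		       int online_vcpus, int *next_rr_cpu)
{
	int i;

	for (i = 0; i < online_vcpus; i++)       /* idle vCPUs first */
		if (idle_mask[i])
			return i;

	i = *next_rr_cpu;                        /* then round robin */
	*next_rr_cpu = (*next_rr_cpu + 1) % online_vcpus;
	return i;
}

int main(void)
{
	unsigned char idle[MAX_VCPUS] = { 0, 0, 1, 0 };  /* vCPU 2 is idle */
	int rr = 0;

	printf("target: %d\n", pick_target(idle, 4, &rr));   /* 2       */
	idle[2] = 0;
	printf("target: %d\n", pick_target(idle, 4, &rr));   /* 0 (rr)  */
	return 0;
}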
struct kvm_s390_gisa_interrupt, timer); + struct kvm *kvm = + container_of(gi->origin, struct sie_page2, gisa)->kvm; + u8 pending_mask; + + pending_mask = gisa_get_ipm_or_restore_iam(gi); + if (pending_mask) { + __airqs_kick_single_vcpu(kvm, pending_mask); + hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); + return HRTIMER_RESTART; + }; + + return HRTIMER_NORESTART; +} + +#define NULL_GISA_ADDR 0x00000000UL +#define NONE_GISA_ADDR 0x00000001UL +#define GISA_ADDR_MASK 0xfffff000UL + +static void process_gib_alert_list(void) +{ + struct kvm_s390_gisa_interrupt *gi; + struct kvm_s390_gisa *gisa; + struct kvm *kvm; + u32 final, origin = 0UL; + + do { + /* + * If the NONE_GISA_ADDR is still stored in the alert list + * origin, we will leave the outer loop. No further GISA has + * been added to the alert list by millicode while processing + * the current alert list. + */ + final = (origin & NONE_GISA_ADDR); + /* + * Cut off the alert list and store the NONE_GISA_ADDR in the + * alert list origin to avoid further GAL interruptions. + * A new alert list can be build up by millicode in parallel + * for guests not in the yet cut-off alert list. When in the + * final loop, store the NULL_GISA_ADDR instead. This will re- + * enable GAL interruptions on the host again. + */ + origin = xchg(&gib->alert_list_origin, + (!final) ? NONE_GISA_ADDR : NULL_GISA_ADDR); + /* + * Loop through the just cut-off alert list and start the + * gisa timers to kick idle vcpus to consume the pending + * interruptions asap. + */ + while (origin & GISA_ADDR_MASK) { + gisa = (struct kvm_s390_gisa *)(u64)origin; + origin = gisa->next_alert; + gisa->next_alert = (u32)(u64)gisa; + kvm = container_of(gisa, struct sie_page2, gisa)->kvm; + gi = &kvm->arch.gisa_int; + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + } + } while (!final); + +} + +void kvm_s390_gisa_clear(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return; + gisa_clear_ipm(gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); +} + void kvm_s390_gisa_init(struct kvm *kvm) { - if (css_general_characteristics.aiv) { - kvm->arch.gisa = &kvm->arch.sie_page2->gisa; - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); - kvm_s390_gisa_clear(kvm); - } + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!css_general_characteristics.aiv) + return; + gi->origin = &kvm->arch.sie_page2->gisa; + gi->alert.mask = 0; + spin_lock_init(&gi->alert.ref_lock); + gi->expires = 50 * 1000; /* 50 usec */ + hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gi->timer.function = gisa_vcpu_kicker; + memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); + gi->origin->next_alert = (u32)(u64)gi->origin; + VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } void kvm_s390_gisa_destroy(struct kvm *kvm) { - if (!kvm->arch.gisa) + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return; + if (gi->alert.mask) + KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", + kvm, gi->alert.mask); + while (gisa_in_alert_list(gi->origin)) + cpu_relax(); + hrtimer_cancel(&gi->timer); + gi->origin = NULL; +} + +/** + * kvm_s390_gisc_register - register a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function extends the vm specific alert mask to use. 
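process_gib_alert_list() above drains the GIB alert list by atomically detaching it, leaving a NONE sentinel so that further alerts keep queueing without raising new interruptions, and repeating until the detached value was already NONE before finally restoring NULL. A user-space model of that protocol, with pointers standing in for the 31-bit GISA origins:

/* Detach-with-exchange loop; NONE_SENTINEL models NONE_GISA_ADDR. */
#include <stdatomic.h>
#include <stdio.h>

struct node { struct node *next; int id; };

#define NONE_SENTINEL ((struct node *)1)

static _Atomic(struct node *) alert_origin;     /* models gib->alert_list_origin */

static void process_alert_list(void)
{
	struct node *origin = NULL;
	int final;

	do {
		final = (origin == NONE_SENTINEL);
		/* cut the list off; keep NONE queued until the final pass */
		origin = atomic_exchange(&alert_origin,
					 final ? NULL : NONE_SENTINEL);
		while (origin && origin != NONE_SENTINEL) {
			struct node *n = origin;

			origin = n->next;
			n->next = n;            /* "not on the list" marker */
			printf("alert for gisa %d\n", n->id);
		}
	} while (!final);
}

int main(void)
{
	struct node a = { .next = NULL, .id = 1 };
	struct node b = { .next = &a,  .id = 2 };

	atomic_store(&alert_origin, &b);        /* two queued alerts */
	process_alert_list();
	return 0;
}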
+ * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + */ +int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + gi->alert.ref_count[gisc]++; + if (gi->alert.ref_count[gisc] == 1) { + gi->alert.mask |= 0x80 >> gisc; + gisa_set_iam(gi->origin, gi->alert.mask); + } + spin_unlock(&gi->alert.ref_lock); + + return gib->nisc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_register); + +/** + * kvm_s390_gisc_unregister - unregister a guest ISC + * + * @kvm: the kernel vm to work with + * @gisc: the guest interruption sub class to register + * + * The function reduces the vm specific alert mask to use. + * The effective IAM mask in the GISA is updated as well + * in case the GISA is not part of the GIB alert list. + * It will be updated latest when the IAM gets restored + * by gisa_get_ipm_or_restore_iam(). + * + * Returns: the nonspecific ISC (NISC) the gib alert mechanism + * has registered with the channel subsystem. + * -ENODEV in case the vm uses no GISA + * -ERANGE in case the guest ISC is invalid + * -EINVAL in case the guest ISC is not registered + */ +int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + int rc = 0; + + if (!gi->origin) + return -ENODEV; + if (gisc > MAX_ISC) + return -ERANGE; + + spin_lock(&gi->alert.ref_lock); + if (gi->alert.ref_count[gisc] == 0) { + rc = -EINVAL; + goto out; + } + gi->alert.ref_count[gisc]--; + if (gi->alert.ref_count[gisc] == 0) { + gi->alert.mask &= ~(0x80 >> gisc); + gisa_set_iam(gi->origin, gi->alert.mask); + } +out: + spin_unlock(&gi->alert.ref_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); + +static void gib_alert_irq_handler(struct airq_struct *airq) +{ + inc_irq_stat(IRQIO_GAL); + process_gib_alert_list(); +} + +static struct airq_struct gib_alert_irq = { + .handler = gib_alert_irq_handler, + .lsi_ptr = &gib_alert_irq.lsi_mask, +}; + +void kvm_s390_gib_destroy(void) +{ + if (!gib) return; - kvm->arch.gisa = NULL; + chsc_sgib(0); + unregister_adapter_interrupt(&gib_alert_irq); + free_page((unsigned long)gib); + gib = NULL; +} + +int kvm_s390_gib_init(u8 nisc) +{ + int rc = 0; + + if (!css_general_characteristics.aiv) { + KVM_EVENT(3, "%s", "gib not initialized, no AIV facility"); + goto out; + } + + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!gib) { + rc = -ENOMEM; + goto out; + } + + gib_alert_irq.isc = nisc; + if (register_adapter_interrupt(&gib_alert_irq)) { + pr_err("Registering the GIB alert interruption handler failed\n"); + rc = -EIO; + goto out_free_gib; + } + + gib->nisc = nisc; + if (chsc_sgib((u32)(u64)gib)) { + pr_err("Associating the GIB with the AIV facility failed\n"); + free_page((unsigned long)gib); + gib = NULL; + rc = -EIO; + goto out_unreg_gal; + } + + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + goto out; + +out_unreg_gal: + unregister_adapter_interrupt(&gib_alert_irq); +out_free_gib: + free_page((unsigned long)gib); + gib = NULL; +out: + return rc; } diff --git 
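kvm_s390_gisc_register()/_unregister() keep a per-ISC use count and set the matching MSB-0 bit in the VM's alert mask while that count is non-zero. A sketch of just that bookkeeping, without the lock or the GISA IAM update:

/* Reference-counted alert-mask maintenance per guest ISC. */
#include <stdint.h>
#include <stdio.h>

#define MAX_ISC 7

struct alert_state {
	uint8_t mask;
	unsigned int ref_count[MAX_ISC + 1];
};

static int gisc_register(struct alert_state *a, unsigned int gisc)
{
	if (gisc > MAX_ISC)
		return -1;                        /* -ERANGE in the kernel  */
	if (a->ref_count[gisc]++ == 0)
		a->mask |= 0x80 >> gisc;          /* first user: alert on it */
	return 0;
}

static int gisc_unregister(struct alert_state *a, unsigned int gisc)
{
	if (gisc > MAX_ISC || a->ref_count[gisc] == 0)
		return -1;                        /* -ERANGE / -EINVAL      */
	if (--a->ref_count[gisc] == 0)
		a->mask &= ~(0x80 >> gisc);       /* last user gone         */
	return 0;
}

int main(void)
{
	struct alert_state a = { 0 };

	gisc_register(&a, 5);
	gisc_register(&a, 5);
	gisc_unregister(&a, 5);
	printf("mask after one unregister: %#x\n", a.mask);   /* still 0x04 */
	return 0;
}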
a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7f4bc58a53b9..4638303ba6a8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -432,11 +432,18 @@ int kvm_arch_init(void *opaque) /* Register floating interrupt controller interface. */ rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { - pr_err("Failed to register FLIC rc=%d\n", rc); + pr_err("A FLIC registration call failed with rc=%d\n", rc); goto out_debug_unreg; } + + rc = kvm_s390_gib_init(GAL_ISC); + if (rc) + goto out_gib_destroy; + return 0; +out_gib_destroy: + kvm_s390_gib_destroy(); out_debug_unreg: debug_unregister(kvm_s390_dbf); return rc; @@ -444,6 +451,7 @@ out_debug_unreg: void kvm_arch_exit(void) { + kvm_s390_gib_destroy(); debug_unregister(kvm_s390_dbf); } @@ -1258,11 +1266,65 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, static int kvm_s390_set_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once supported by kernel + hw, we have to store the subfunctions - * in kvm->arch and remember that user space configured them. - */ - return -ENXIO; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + if (copy_from_user(&kvm->arch.model.subfuncs, (void __user *)attr->addr, + sizeof(struct kvm_s390_vm_cpu_subfunc))) { + mutex_unlock(&kvm->lock); + return -EFAULT; + } + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "SET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "SET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "SET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "SET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "SET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "SET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "SET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "SET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "SET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "SET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "SET: 
guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "SET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + + return 0; } static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) @@ -1381,12 +1443,56 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, static int kvm_s390_get_processor_subfunc(struct kvm *kvm, struct kvm_device_attr *attr) { - /* - * Once we can actually configure subfunctions (kernel + hw support), - * we have to check if they were already set by user space, if so copy - * them from kvm->arch. - */ - return -ENXIO; + if (copy_to_user((void __user *)attr->addr, &kvm->arch.model.subfuncs, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + + VM_EVENT(kvm, 3, "GET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2], + ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]); + VM_EVENT(kvm, 3, "GET: guest PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]); + VM_EVENT(kvm, 3, "GET: guest KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]); + VM_EVENT(kvm, 3, "GET: guest KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]); + VM_EVENT(kvm, 3, "GET: guest KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.km)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]); + VM_EVENT(kvm, 3, "GET: guest KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]); + VM_EVENT(kvm, 3, "GET: guest KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]); + VM_EVENT(kvm, 3, "GET: guest PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: guest KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: guest KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]); + VM_EVENT(kvm, 3, "GET: guest KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]); + VM_EVENT(kvm, 3, "GET: guest PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]); + VM_EVENT(kvm, 3, "GET: guest PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0], + ((unsigned long *) 
&kvm->arch.model.subfuncs.ppno)[1]); + VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); + + return 0; } static int kvm_s390_get_machine_subfunc(struct kvm *kvm, @@ -1395,8 +1501,55 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, sizeof(struct kvm_s390_vm_cpu_subfunc))) return -EFAULT; + + VM_EVENT(kvm, 3, "GET: host PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.plo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[1], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[2], + ((unsigned long *) &kvm_s390_available_subfunc.plo)[3]); + VM_EVENT(kvm, 3, "GET: host PTFF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ptff)[1]); + VM_EVENT(kvm, 3, "GET: host KMAC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmac)[1]); + VM_EVENT(kvm, 3, "GET: host KMC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmc)[1]); + VM_EVENT(kvm, 3, "GET: host KM subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.km)[0], + ((unsigned long *) &kvm_s390_available_subfunc.km)[1]); + VM_EVENT(kvm, 3, "GET: host KIMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kimd)[1]); + VM_EVENT(kvm, 3, "GET: host KLMD subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[0], + ((unsigned long *) &kvm_s390_available_subfunc.klmd)[1]); + VM_EVENT(kvm, 3, "GET: host PCKMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[1]); + VM_EVENT(kvm, 3, "GET: host KMCTR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[1]); + VM_EVENT(kvm, 3, "GET: host KMF subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmf)[1]); + VM_EVENT(kvm, 3, "GET: host KMO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kmo)[1]); + VM_EVENT(kvm, 3, "GET: host PCC subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pcc)[1]); + VM_EVENT(kvm, 3, "GET: host PPNO subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[0], + ((unsigned long *) &kvm_s390_available_subfunc.ppno)[1]); + VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], + ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); + return 0; } + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1514,10 +1667,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_FEAT: case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + case 
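With KVM_S390_VM_CPU_PROCESSOR_SUBFUNC now handled by both the set and get paths, user space can configure the subfunctions with KVM_SET_DEVICE_ATTR before any vCPU exists (the kernel rejects later attempts with -EBUSY) and read them back at any time. A rough sketch of the read-back side, assuming an s390 host, a VM file descriptor obtained from KVM_CREATE_VM elsewhere, and a linux/kvm.h new enough to carry these definitions; error handling is reduced to a minimum:

/* Query the guest's configured CPU subfunctions from user space. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_guest_subfuncs(int vm_fd, struct kvm_s390_vm_cpu_subfunc *sf)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_MODEL,
		.attr  = KVM_S390_VM_CPU_PROCESSOR_SUBFUNC,
		.addr  = (unsigned long)sf,
	};

	/* only query it if the kernel reports the attribute as present */
	if (ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr))
		return -1;
	return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}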
KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = 0; break; - /* configuring subfunctions is not supported yet */ - case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; @@ -2209,6 +2361,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.sie_page2) goto out_err; + kvm->arch.sie_page2->kvm = kvm; kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { @@ -2218,6 +2371,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & kvm_s390_fac_base[i]; } + kvm->arch.model.subfuncs = kvm_s390_available_subfunc; /* we are always in czam mode - even on pre z14 machines */ set_kvm_facility(kvm->arch.model.fac_mask, 138); @@ -2812,7 +2966,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; + vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; if (vcpu->arch.sie_block->gd && sclp.has_gisaf) vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); @@ -3458,6 +3612,8 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } + clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); @@ -4293,12 +4449,12 @@ static int __init kvm_s390_init(void) int i; if (!sclp.has_sief2) { - pr_info("SIE not available\n"); + pr_info("SIE is not available\n"); return -ENODEV; } if (nested && hpage) { - pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently"); + pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); return -EINVAL; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 1f6e36cdce0d..6d9448dbd052 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -67,7 +67,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); + return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -381,6 +381,8 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); +int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 180373360e34..a5db4475e72d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include <asm/msr-index.h> #include <asm/asm.h> #include <asm/kvm_page_track.h> +#include <asm/kvm_vcpu_regs.h> #include <asm/hyperv-tlfs.h> #define KVM_MAX_VCPUS 288 @@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define ASYNC_PF_PER_VCPU 64 enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, + VCPU_REGS_RAX = __VCPU_REGS_RAX, + VCPU_REGS_RCX = __VCPU_REGS_RCX, + VCPU_REGS_RDX = __VCPU_REGS_RDX, + 
VCPU_REGS_RBX = __VCPU_REGS_RBX, + VCPU_REGS_RSP = __VCPU_REGS_RSP, + VCPU_REGS_RBP = __VCPU_REGS_RBP, + VCPU_REGS_RSI = __VCPU_REGS_RSI, + VCPU_REGS_RDI = __VCPU_REGS_RDI, #ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, + VCPU_REGS_R8 = __VCPU_REGS_R8, + VCPU_REGS_R9 = __VCPU_REGS_R9, + VCPU_REGS_R10 = __VCPU_REGS_R10, + VCPU_REGS_R11 = __VCPU_REGS_R11, + VCPU_REGS_R12 = __VCPU_REGS_R12, + VCPU_REGS_R13 = __VCPU_REGS_R13, + VCPU_REGS_R14 = __VCPU_REGS_R14, + VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, NR_VCPU_REGS @@ -319,6 +320,7 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; bool unsync; + bool mmio_cached; /* * The following two entries are used to key the shadow page in the @@ -333,10 +335,6 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - - /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ - unsigned long mmu_valid_gen; - DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -848,13 +846,11 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -1255,7 +1251,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h new file mode 100644 index 000000000000..1af2cb59233b --- /dev/null +++ b/arch/x86/include/asm/kvm_vcpu_regs.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_VCPU_REGS_H +#define _ASM_X86_KVM_VCPU_REGS_H + +#define __VCPU_REGS_RAX 0 +#define __VCPU_REGS_RCX 1 +#define __VCPU_REGS_RDX 2 +#define __VCPU_REGS_RBX 3 +#define __VCPU_REGS_RSP 4 +#define __VCPU_REGS_RBP 5 +#define __VCPU_REGS_RSI 6 +#define __VCPU_REGS_RDI 7 + +#ifdef CONFIG_X86_64 +#define __VCPU_REGS_R8 8 +#define __VCPU_REGS_R9 9 +#define __VCPU_REGS_R10 10 +#define __VCPU_REGS_R11 11 +#define __VCPU_REGS_R12 12 +#define __VCPU_REGS_R13 13 +#define __VCPU_REGS_R14 14 +#define __VCPU_REGS_R15 15 +#endif + +#endif /* _ASM_X86_KVM_VCPU_REGS_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e811d4d1c824..904494b924c1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void) static inline void kvm_sched_clock_init(bool stable) { - if (!stable) { - pv_ops.time.sched_clock = kvm_clock_read; + if (!stable) clear_sched_clock_stable(); - return; - } - kvm_sched_clock_offset = kvm_clock_read(); pv_ops.time.sched_clock = kvm_sched_clock_read; @@ -355,6 +351,20 @@ void __init kvmclock_init(void) 
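The new asm/kvm_vcpu_regs.h exists so that C and assembly code agree on how vcpu->arch.regs[] is indexed; the enum above now merely aliases the shared defines. A stand-alone illustration of the kind of cross-check the shared header makes cheap; the two defines are restated here only so the snippet compiles on its own, the remaining GPR indices are elided:

/* C enum and asm-visible defines must stay in lockstep. */
#define __VCPU_REGS_RAX 0
#define __VCPU_REGS_RDI 7

enum kvm_reg_model {              /* mirrors the rewritten enum kvm_reg */
	VCPU_REGS_RAX = __VCPU_REGS_RAX,
	VCPU_REGS_RDI = __VCPU_REGS_RDI,
};

_Static_assert(VCPU_REGS_RDI == 7, "C enum and asm defines must agree");

int main(void) { return 0; }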
machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); + + /* + * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * Invariant TSC exposed by host means kvmclock is not necessary: + * can use TSC as clocksource. + * + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + !check_tsc_unstable()) + kvm_clock.rating = 299; + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c07958b59f50..fd3951638ae4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -405,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 89d20ed1d2e8..27c43525a05f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index af192895b1fc..4a6dc54cc12b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_t pid_nr; int ret; - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); if (!pit) return NULL; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bdcd4139eca9..8b38bb4868a6 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm) struct kvm_pic *s; int ret; - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT); if (!s) return -ENOMEM; spin_lock_init(&s->lock); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4e822ad363f3..1add1bc881e2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm) struct kvm_ioapic *ioapic; int ret; - ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT); if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b6c2da7265c..991fdf7fc17f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), + GFP_KERNEL_ACCOUNT); if (!new) goto out; @@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) ASSERT(vcpu != NULL); apic_debug("apic_init %d\n", vcpu->vcpu_id); - apic = kzalloc(sizeof(*apic), GFP_KERNEL); + apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + apic->regs = (void 
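The kvmclock hunk above lowers kvm_clock's rating to 299 when the host exposes a constant, non-stop TSC: the clocksource core prefers the highest-rated registered source, the TSC clocksource is rated 300 and kvm-clock normally 400, so the invariant TSC wins while kvmclock stays registered as a fallback. A toy model of that rating-based selection:

/* Pick the highest-rated clocksource, as the clocksource core does. */
#include <stdio.h>

struct clocksource_model { const char *name; int rating; };

static const struct clocksource_model *
pick_best(const struct clocksource_model *cs, int n)
{
	const struct clocksource_model *best = &cs[0];
	int i;

	for (i = 1; i < n; i++)
		if (cs[i].rating > best->rating)
			best = &cs[i];
	return best;
}

int main(void)
{
	struct clocksource_model cs[] = {
		{ "tsc", 300 }, { "kvm-clock", 299 /* was 400 */ },
	};

	printf("selected: %s\n", pick_best(cs, 2)->name);    /* tsc */
	return 0;
}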
*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f2d1d230d5b8..7837ab001d80 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -109,9 +109,11 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif #define PT64_LVL_ADDR_MASK(level) \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT64_LEVEL_BITS))) - 1)) @@ -330,53 +332,56 @@ static inline bool is_access_track_spte(u64 spte) } /* - * the low bit of the generation number is always presumed to be zero. - * This disables mmio caching during memslot updates. The concept is - * similar to a seqcount but instead of retrying the access we just punt - * and ignore the cache. + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: * - * spte bits 3-11 are used as bits 1-9 of the generation number, - * the bits 52-61 are used as bits 10-19 of the generation number. + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. 
*/ -#define MMIO_SPTE_GEN_LOW_SHIFT 2 -#define MMIO_SPTE_GEN_HIGH_SHIFT 52 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) -#define MMIO_GEN_SHIFT 20 -#define MMIO_GEN_LOW_SHIFT 10 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) -#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) -static u64 generation_mmio_spte_mask(unsigned int gen) +#define MMIO_SPTE_GEN_HIGH_START 52 +#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) +static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_GEN_MASK); + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } -static unsigned int get_mmio_spte_generation(u64 spte) +static u64 get_mmio_spte_generation(u64 spte) { - unsigned int gen; + u64 gen; spte &= ~shadow_mmio_mask; - gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; - gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -386,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + page_header(__pa(sptep))->mmio_cached = true; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -407,7 +414,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } @@ -424,9 +431,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - unsigned int kvm_gen, spte_gen; + u64 kvm_gen, spte_gen, gen; - kvm_gen = kvm_current_mmio_generation(vcpu); + gen = kvm_vcpu_memslots(vcpu)->generation; + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) + return false; + + kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -959,7 +970,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 
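The rewritten MMIO-spte helpers pack a 19-bit generation into two disjoint spte bit ranges, as the comment above describes: bits 0-8 into spte bits 3-11 and bits 9-18 into spte bits 52-61. A stand-alone round-trip model of that packing; it follows the comment's description rather than copying the kernel helpers:

/* Encode/decode a 19-bit generation into two spte bit ranges. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define GEN_MASK    ((1ULL << 19) - 1)
#define LOW_BITS    9                     /* generation bits 0..8  */
#define LOW_SHIFT   3                     /* land in spte bits 3..11  */
#define HIGH_SHIFT  52                    /* bits 9..18 land in 52..61 */

static uint64_t gen_to_spte_bits(uint64_t gen)
{
	uint64_t low  = gen & ((1ULL << LOW_BITS) - 1);
	uint64_t high = gen >> LOW_BITS;

	return (low << LOW_SHIFT) | (high << HIGH_SHIFT);
}

static uint64_t spte_bits_to_gen(uint64_t spte)
{
	uint64_t low  = (spte >> LOW_SHIFT) & ((1ULL << LOW_BITS) - 1);
	uint64_t high = (spte >> HIGH_SHIFT) & ((1ULL << 10) - 1);

	return low | (high << LOW_BITS);
}

int main(void)
{
	uint64_t gen = 0x5abcd & GEN_MASK;    /* arbitrary 19-bit value */

	assert(spte_bits_to_gen(gen_to_spte_bits(gen)) == gen);
	printf("round-trip ok for gen %#llx\n", (unsigned long long)gen);
	return 0;
}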
0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -2049,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * The active_mmu_pages list is the FIFO list, do not move the - * page until it is zapped. kvm_zap_obsolete_pages depends on - * this feature. See the comments in kvm_zap_obsolete_pages(). - */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2195,23 +2200,15 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -/* - * NOTE: we should pay more attention on the zapped-obsolete page - * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk - * since it has been deleted from active_mmu_pages but still can be found - * at hast list. - * - * for_each_valid_sp() has skipped that kind of pages. - */ #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if ((_sp)->role.invalid) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2231,18 +2228,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, + struct list_head *invalid_list, + bool remote_flush) +{ + if (!remote_flush && !list_empty(invalid_list)) + return false; + + if (!list_empty(invalid_list)) + kvm_mmu_commit_zap_page(kvm, invalid_list); + else + kvm_flush_remote_tlbs(kvm); + return true; +} + static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, struct list_head *invalid_list, bool remote_flush, bool local_flush) { - if (!list_empty(invalid_list)) { - kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) return; - } - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) + if (local_flush) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } @@ -2253,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2482,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -2668,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list, + int *nr_zapped) { - 
int ret; + bool list_unstable; trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + /* Zapping children means active_mmu_pages has become unstable. */ + list_unstable = *nr_zapped; + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp); @@ -2686,22 +2692,27 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ - ret++; + (*nr_zapped)++; list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - /* - * The obsolete pages can not be used on any vcpus. - * See the comments in kvm_mmu_invalidate_zap_all_pages(). - */ - if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + if (!sp->role.invalid) kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; - return ret; + return list_unstable; +} + +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + int nr_zapped; + + __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); + return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -3703,7 +3714,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) u64 *lm_root; - lm_root = (void*)get_zeroed_page(GFP_KERNEL); + lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (lm_root == NULL) return 1; @@ -4204,14 +4215,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU - * generation number. However, that is accompanied by - * KVM_REQ_MMU_RELOAD, which will free the root that we - * have set here and allocate a new one. - */ - kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5486,6 +5489,76 @@ void kvm_disable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_disable_tdp); + +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + +/* The caller should hold mmu-lock before calling this function. 
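__kvm_mmu_prepare_zap_page() now reports the number of pages zapped and whether the active_mmu_pages list may have been reshuffled as two separate outputs, so callers walking that list restart only when child pages were actually zapped. A toy model of that contract, with integers standing in for shadow pages:

/* Separate "how many were zapped" from "is the list walk still valid". */
#include <stdbool.h>
#include <stdio.h>

static int zap_children(int page)          /* pretend odd pages have a child */
{
	return page % 2;
}

static bool prepare_zap(int page, int *nr_zapped)
{
	bool list_unstable;

	*nr_zapped = zap_children(page);   /* children moved list entries */
	list_unstable = *nr_zapped != 0;

	(*nr_zapped)++;                    /* count the page itself */
	return list_unstable;
}

int main(void)
{
	int nr_zapped;

	if (prepare_zap(3, &nr_zapped))
		printf("zapped %d pages, restart the list walk\n", nr_zapped);
	else
		printf("zapped %d pages, walk can continue\n", nr_zapped);
	return 0;
}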
*/ +static __always_inline bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +{ + struct slot_rmap_walk_iterator iterator; + bool flush = false; + + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + + return flush; +} + +static __always_inline bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static __always_inline bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu->pae_root); @@ -5505,7 +5578,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; @@ -5543,105 +5616,62 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - kvm_mmu_invalidate_zap_all_pages(kvm); -} - -void kvm_mmu_init_vm(struct kvm *kvm) -{ - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - - node->track_write = kvm_mmu_pte_write; - node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; - kvm_page_track_register_notifier(kvm, node); -} + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + unsigned long i; + bool flush; + gfn_t gfn; -void kvm_mmu_uninit_vm(struct kvm *kvm) -{ - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock(&kvm->mmu_lock); - kvm_page_track_unregister_notifier(kvm, node); -} + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_unlock; -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); -/* The caller should hold mmu-lock before calling this function. 
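slot_handle_level_range(), relocated earlier in the file by the hunks above, batches a pending-TLB-flush flag across the rmap walk and flushes before every voluntary lock break so that no visited-but-unflushed range survives a reschedule. A generic illustration of that pattern, with stand-ins for the lock, the iterator and the flush:

/* Accumulate a flush flag; flush before yielding, and report the remainder. */
#include <stdbool.h>
#include <stdio.h>

static bool visit_range(int idx)           /* stand-in for fn(kvm, rmap)   */
{
	return (idx % 3) == 0;             /* some ranges dirty the TLB    */
}

static bool should_yield(int idx)          /* stand-in for need_resched()  */
{
	return (idx % 4) == 3;
}

static void flush_tlbs(void)
{
	printf("flush\n");
}

static bool walk_ranges(int nr_ranges)
{
	bool flush = false;
	int i;

	for (i = 0; i < nr_ranges; i++) {
		flush |= visit_range(i);
		if (should_yield(i)) {
			if (flush) {
				flush_tlbs();   /* flush before dropping the lock */
				flush = false;
			}
			/* cond_resched_lock(&kvm->mmu_lock) would go here */
		}
	}
	return flush;                      /* caller decides on a final flush */
}

int main(void)
{
	if (walk_ranges(10))
		flush_tlbs();
	return 0;
}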
*/ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) -{ - struct slot_rmap_walk_iterator iterator; - bool flush = false; + for (i = 0; i < slot->npages; i++) { + gfn = slot->base_gfn + i; - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); + for_each_valid_sp(kvm, sp, gfn) { + if (sp->gfn != gfn) + continue; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - - return flush; +out_unlock: + spin_unlock(&kvm->mmu_lock); } -static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) +void kvm_mmu_init_vm(struct kvm *kvm) { - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); -} + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; -static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; + kvm_page_track_register_notifier(kvm, node); } -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) +void kvm_mmu_uninit_vm(struct kvm *kvm) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; -static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); + kvm_page_track_unregister_notifier(kvm, node); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool flush_tlb = true; - bool flush = false; int i; - if (kvm_available_flush_tlb_with_range()) - flush_tlb = false; - spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5653,17 +5683,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush |= slot_handle_level_range(kvm, memslot, - kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, start, - end - 1, flush_tlb); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start + 1); - spin_unlock(&kvm->mmu_lock); } @@ -5815,101 +5840,58 @@ 
void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -#define BATCH_ZAP_PAGES 10 -static void kvm_zap_obsolete_pages(struct kvm *kvm) +static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) { struct kvm_mmu_page *sp, *node; - int batch = 0; + LIST_HEAD(invalid_list); + int ign; + spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe_reverse(sp, node, - &kvm->arch.active_mmu_pages, link) { - int ret; - - /* - * No obsolete page exists before new created page since - * active_mmu_pages is the FIFO list. - */ - if (!is_obsolete_sp(kvm, sp)) - break; - - /* - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. - */ - if (sp->role.invalid) + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (mmio_only && !sp->mmio_cached) continue; - - /* - * Need not flush tlb since we only zap the sp with invalid - * generation number. - */ - if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { - batch = 0; + if (sp->role.invalid && sp->root_count) + continue; + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { + WARN_ON_ONCE(mmio_only); goto restart; } - - ret = kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages); - batch += ret; - - if (ret) + if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } - /* - * Should flush tlb before free page tables since lockless-walking - * may use the pages. - */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); -} - -/* - * Fast invalidate all shadow pages and use lock-break technique - * to zap obsolete pages. - * - * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does - * not use any resource of the being-deleted slot or all slots - * after calling the function. - */ -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) -{ - spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); - kvm->arch.mmu_valid_gen++; - - /* - * Notify all vcpus to reload its shadow page table - * and flush TLB. Then all vcpus will switch to new - * shadow page table with the new mmu_valid_gen. - * - * Note: we should do this under the protection of - * mmu-lock, otherwise, vcpu would purge shadow page - * but miss tlb flush. - */ - kvm_reload_remote_mmus(kvm); - - kvm_zap_obsolete_pages(kvm); + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +void kvm_mmu_zap_all(struct kvm *kvm) { - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); + return __kvm_mmu_zap_all(kvm, false); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + + gen &= MMIO_SPTE_GEN_MASK; + /* - * The very rare case: if the generation-number is round, + * Generation numbers are incremented in multiples of the number of + * address spaces in order to provide unique generations across all + * address spaces. Strip what is effectively the address space + * modifier prior to checking for a wrap of the MMIO generation so + * that a wrap in any address space is detected. + */ + gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + + /* + * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. 
*/ - if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { + if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_invalidate_zap_all_pages(kvm); + __kvm_mmu_zap_all(kvm, true); } } @@ -5940,24 +5922,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) + if (!kvm->arch.n_used_mmu_pages) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); -unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7b333147c4a..bbdc60f2fae8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988c..9f6c855a0043 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,18 +8,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -31,9 +29,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ - __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? 
" pae" : "", \ role.quadrant, \ @@ -283,27 +280,6 @@ TRACE_EVENT( ); TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - -TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), TP_ARGS(spte, kvm_gen, spte_gen), diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 3052a59a3065..fd04d462fdae 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f13a3a24d360..b5b128a0a051 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -145,7 +145,6 @@ struct kvm_svm { /* Struct members for AVIC */ u32 avic_vm_id; - u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; struct hlist_node hnode; @@ -236,6 +235,7 @@ struct vcpu_svm { bool nrips_enabled : 1; u32 ldr_reg; + u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; @@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = vmalloc(size); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); else - pages = kmalloc(size, GFP_KERNEL); + pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) return NULL; @@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm, static struct kvm *svm_vm_alloc(void) { - struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); + struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_svm->kvm; } @@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL); + p_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!p_page) goto free_avic; @@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL); + l_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!l_page) goto free_avic; @@ -2106,6 +2109,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) INIT_LIST_HEAD(&svm->ir_list); spin_lock_init(&svm->ir_list_lock); + svm->dfr_reg = APIC_DFR_FLAT; return ret; } @@ -2119,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; goto out; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to 
allocate vcpu's fpu\n"); err = -ENOMEM; @@ -2137,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto free_svm; err = -ENOMEM; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) goto uninit; - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) goto free_page2; - hsave_page = alloc_page(GFP_KERNEL); + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) goto free_page3; @@ -4565,8 +4570,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, - bool valid) +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -4579,31 +4583,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); - if (valid) - new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; - else - new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); return 0; } +static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool flat = svm->dfr_reg == APIC_DFR_FLAT; + u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + + if (entry) + WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); +} + static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) { - int ret; + int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); - if (!ldr) - return 1; + if (ldr == svm->ldr_reg) + return 0; - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); - if (ret && svm->ldr_reg) { - avic_ldr_write(vcpu, 0, svm->ldr_reg, false); - svm->ldr_reg = 0; - } else { + avic_invalidate_logical_id_entry(vcpu); + + if (ldr) + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + + if (!ret) svm->ldr_reg = ldr; - } + return ret; } @@ -4637,27 +4649,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) return 0; } -static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); - u32 mod = (dfr >> 28) & 0xf; - /* - * We assume that all local APICs are using the same type. - * If this changes, we need to flush the AVIC logical - * APID id table. 
- */ - if (kvm_svm->ldr_mode == mod) - return 0; - - clear_page(page_address(kvm_svm->avic_logical_id_table_page)); - kvm_svm->ldr_mode = mod; + if (svm->dfr_reg == dfr) + return; - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - return 0; + avic_invalidate_logical_id_entry(vcpu); + svm->dfr_reg = dfr; } static int avic_unaccel_trap_write(struct vcpu_svm *svm) @@ -5125,11 +5126,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - return; - - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - mark_dirty(vmcb, VMCB_INTR); + if (kvm_vcpu_apicv_active(vcpu)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_AVIC); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -5195,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -6163,8 +6164,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; - if (avic_handle_dfr_update(vcpu) != 0) - return; + avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } @@ -6311,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) if (ret) return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6361,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - start = kzalloc(sizeof(*start), GFP_KERNEL); + start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); if (!start) return -ENOMEM; @@ -6458,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6535,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6597,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6618,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6646,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, struct sev_data_dbg *data; int ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6901,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) } ret = -ENOMEM; - data = 
kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) goto e_unpin_memory; @@ -7007,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm, if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d737a51a53ca..f24a2c225070 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); + vmx_leave_nested(vcpu); vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); free_nested(vcpu); vcpu_put(vcpu); @@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) prepare_vmcs02_early_full(vmx, vmcs12); /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - - /* * PIN CONTROLS */ exec_control = vmcs12->pin_based_vm_exec_control; @@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. @@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + bool vm_fail; if (!nested_early_check) return 0; @@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - vmx->__launched = vmx->loaded_vmcs->launched; - asm( - /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "je 1f \n\t" + __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" + "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" + /* + * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set + * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail + * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the + * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. 
+ */ "call vmx_vmenter\n\t" - /* Set vmx->fail accordingly */ - "setbe %c[fail](%% " _ASM_CX")\n\t" - : ASM_CALL_CONSTRAINT - : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + CC_SET(be) + : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) + : [HOST_RSP]"r"((unsigned long)HOST_RSP), + [loaded_vmcs]"r"(vmx->loaded_vmcs), + [launched]"i"(offsetof(struct loaded_vmcs, launched)), + [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) - : "rax", "cc", "memory" + : "cc", "memory" ); preempt_enable(); @@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - if (vmx->fail) { + if (vm_fail) { WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; return 1; } @@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) kvm_make_request(KVM_REQ_EVENT, vcpu); /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. 
It will only return (and set @@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; @@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_flush_tlb(vcpu, true); } - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) off += kvm_register_read(vcpu, index_reg)<<scaling; vmx_get_segment(vcpu, &s, seg_reg); - *ret = s.base + off; + /* + * The effective address, i.e. @off, of a memory operand is truncated + * based on the address size of the instruction. Note that this is + * the *effective address*, i.e. the address prior to accounting for + * the segment's base. + */ if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; + off &= 0xffffffff; + else if (addr_size == 0) /* 16 bit */ + off &= 0xffff; /* Checks for #GP/#SS exceptions. */ exn = false; if (is_long_mode(vcpu)) { + /* + * The virtual/linear address is never truncated in 64-bit + * mode, e.g. a 32-bit address size can yield a 64-bit virtual + * address when using FS/GS with a non-zero base. + */ + *ret = s.base + off; + /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { + } else { + /* + * When not in long mode, the virtual/linear address is + * unconditionally truncated to 32 bits regardless of the + * address size. + */ + *ret = (s.base + off) & 0xffffffff; + /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. 
segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, @@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6def3ba88e3b..cb6079f8a227 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -34,6 +34,7 @@ struct vmcs_host_state { unsigned long cr4; /* May not match real cr4 */ unsigned long gs_base; unsigned long fs_base; + unsigned long rsp; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bcef2c7e9bc4..7b272738c576 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,6 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/asm.h> +#include <asm/bitsperlong.h> +#include <asm/kvm_vcpu_regs.h> + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif .text @@ -55,3 +79,146 @@ ENDPROC(vmx_vmenter) ENTRY(vmx_vmexit) ret ENDPROC(vmx_vmexit) + +/** + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) + * @launched: %true if the VMCS has been launched + * + * Returns: + * 0 on VM-Exit, 1 on VM-Fail + */ +ENTRY(__vmx_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + + /* Copy @launched to BL, _ASM_ARG3 is volatile. 
*/ + mov %_ASM_ARG3B, %bl + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX + + /* Check if vmlaunch or vmresume is needed */ + cmpb $0, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RCX(%_ASM_AX), %_ASM_CX + mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RSI(%_ASM_AX), %_ASM_SI + mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RBP(%_ASM_AX), %_ASM_BP +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_AX), %r8 + mov VCPU_R9 (%_ASM_AX), %r9 + mov VCPU_R10(%_ASM_AX), %r10 + mov VCPU_R11(%_ASM_AX), %r11 + mov VCPU_R12(%_ASM_AX), %r12 + mov VCPU_R13(%_ASM_AX), %r13 + mov VCPU_R14(%_ASM_AX), %r14 + mov VCPU_R15(%_ASM_AX), %r15 +#endif + /* Load guest RAX. This kills the vmx_vcpu pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX + + /* Enter guest mode */ + call vmx_vmenter + + /* Jump on VM-Fail. */ + jbe 2f + + /* Temporarily save guest's RAX. */ + push %_ASM_AX + + /* Reload @regs to RAX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_AX + + /* Save all guest registers, including RAX from the stack */ + __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) +#ifdef CONFIG_X86_64 + mov %r8, VCPU_R8 (%_ASM_AX) + mov %r9, VCPU_R9 (%_ASM_AX) + mov %r10, VCPU_R10(%_ASM_AX) + mov %r11, VCPU_R11(%_ASM_AX) + mov %r12, VCPU_R12(%_ASM_AX) + mov %r13, VCPU_R13(%_ASM_AX) + mov %r14, VCPU_R14(%_ASM_AX) + mov %r15, VCPU_R15(%_ASM_AX) +#endif + + /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %eax, %eax + + /* + * Clear all general purpose registers except RSP and RAX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RAX are exempt as RSP is restored by hardware during + * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. + */ +1: xor %ebx, %ebx + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp +#ifdef CONFIG_X86_64 + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, %r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d +#endif + + /* "POP" @regs. */ + add $WORD_SIZE, %_ASM_SP + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif + pop %_ASM_BP + ret + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ +2: mov $1, %eax + jmp 1b +ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 30a6bcd735ec..c73375e01ab8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + /* + * This allocation for vmx_l1d_flush_pages is not tied to a VM + * lifetime and so should not be charged to a memcg. 
+ */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; @@ -2387,13 +2391,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, flags, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -2440,7 +2444,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_init(loaded_vmcs); if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + loaded_vmcs->msr_bitmap = (unsigned long *) + __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); @@ -2481,7 +2486,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(false, cpu); + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -6360,150 +6365,15 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } -static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { - unsigned long evmcs_rsp; - - vmx->__launched = vmx->loaded_vmcs->launched; - - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? - (unsigned long)¤t_evmcs->host_rsp : 0; - - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - asm( - /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ - "push %%" _ASM_CX " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - - /* Reload cr2 if changed */ - "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" - /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" - /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" - "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" - "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" - "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" - "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" - "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" - "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" - "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" -#endif - /* Load guest RCX. This kills the vmx_vcpu pointer! 
*/ - "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" - - /* Enter guest mode */ - "call vmx_vmenter\n\t" - - /* Save guest's RCX to the stack placeholder (see above) */ - "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" - - /* Load host's RCX, i.e. the vmx_vcpu pointer */ - "pop %%" _ASM_CX " \n\t" - - /* Set vmx->fail based on EFLAGS.{CF,ZF} */ - "setbe %c[fail](%%" _ASM_CX ")\n\t" - - /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" - "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" - "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" - "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" - "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" - "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" - "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" - "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" - /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" - - "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - : ASM_CALL_CONSTRAINT - : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), -#endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "eax", "ebx", "edi" -#endif - ); + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { + vmx->loaded_vmcs->host_state.rsp = host_rsp; + vmcs_writel(HOST_RSP, host_rsp); + } } -STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); + +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, 
bool launched); static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { @@ -6572,7 +6442,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - __vmx_vcpu_run(vcpu, vmx); + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); + + vcpu->arch.cr2 = read_cr2(); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -6657,7 +6536,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) static struct kvm *vmx_vm_alloc(void) { - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_vmx->kvm; } @@ -6673,7 +6554,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); - leave_guest_mode(vcpu); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); @@ -6685,14 +6565,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int cpu; + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -6714,12 +6596,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. 
*/ if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0ac0a64c7790..1554cb45b393 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -175,7 +175,6 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; - unsigned long host_rsp; u8 fail; u8 msr_bitmap_mode; u32 exit_intr_info; @@ -209,7 +208,7 @@ struct vcpu_vmx { struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; @@ -339,8 +338,8 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) static inline void pi_set_sn(struct pi_desc *pi_desc) { - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); } static inline void pi_set_on(struct pi_desc *pi_desc) @@ -445,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); @@ -455,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmexit_ctrl & + return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } @@ -478,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -487,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), + GFP_KERNEL_ACCOUNT); } u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 941f932373d0..65e4559eef2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) @@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct 
kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; @@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; @@ -7055,6 +7056,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { + if (!lapic_in_kernel(vcpu)) { + WARN_ON_ONCE(vcpu->arch.apicv_active); + return; + } + if (!vcpu->arch.apicv_active) + return; + vcpu->arch.apicv_active = false; kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } @@ -9005,7 +9013,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9026,6 +9033,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; @@ -9033,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) { r = -ENOMEM; goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) { r = -ENOMEM; goto fail_free_mce_banks; } @@ -9104,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9299,13 +9307,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->arch.rmap[i] = kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); + linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -9348,13 +9356,13 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. 
*/ - kvm_mmu_invalidate_mmio_sptes(kvm, slots); + kvm_mmu_invalidate_mmio_sptes(kvm, gen); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -9462,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224cd0a47568..28406aa1136d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la, static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { + u64 gen = kvm_memslots(vcpu->kvm)->generation; + + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) + return; + /* * If this is a shadow nested page table, the "GVA" is * actually a nGPA. @@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; - vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; + vcpu->arch.mmio_gen = gen; } static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) |
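
The slot_handle_level_range() helper that this diff moves up in mmu.c encapsulates a recurring pattern: accumulate a pending-TLB-flush flag while walking a memslot's rmaps, and force a flush before every lock break so nothing can observe zapped-but-unflushed translations. The user-space sketch below models only that control flow; the handler, the lock-break test and the flush are stand-ins for the kernel primitives, not their real implementations.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel primitives used by slot_handle_level_range(). */
static bool handler_returns_flush(int idx)   /* models fn(kvm, iterator.rmap) */
{
	return (idx % 3) == 0;               /* pretend some entries dirty the TLB */
}

static bool need_lock_break(int idx)         /* models need_resched()/spin_needbreak() */
{
	return (idx % 5) == 4;
}

static void flush_remote_tlbs(void)          /* models kvm_flush_remote_tlbs() */
{
	puts("  flush remote TLBs");
}

/*
 * Keep a sticky 'flush' flag, but flush before every lock break so other
 * walkers never see stale translations, then flush once more at the end
 * if anything is still pending.
 */
static bool walk_range(int nr_entries, bool lock_flush_tlb)
{
	bool flush = false;
	int i;

	for (i = 0; i < nr_entries; i++) {
		flush |= handler_returns_flush(i);

		if (need_lock_break(i)) {
			if (flush && lock_flush_tlb) {
				flush_remote_tlbs();
				flush = false;
			}
			printf("  drop and re-take lock at entry %d\n", i);
		}
	}

	if (flush && lock_flush_tlb) {
		flush_remote_tlbs();
		flush = false;
	}
	return flush;   /* tells the caller whether a flush is still owed */
}

int main(void)
{
	printf("pending flush on return: %d\n", walk_range(12, true));
	return 0;
}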
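
kvm_mmu_invalidate_mmio_sptes() now takes the raw memslot generation and must strip both the update-in-progress flag and the low bits that act as an address-space modifier before testing for a wrap. The sketch below reproduces that masking with invented constants (the real KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS, MMIO_SPTE_GEN_MASK and KVM_ADDRESS_SPACE_NUM values come from the KVM headers); it only illustrates why a wrap in any address space shows up as gen == 0.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/*
 * Illustrative constants only; the real values live in the KVM headers.
 * The address-space count must be a power of two for the mask trick.
 */
#define GEN_UPDATE_IN_PROGRESS	(1ULL << 63)	/* stand-in for KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS */
#define MMIO_SPTE_GEN_MASK	0x3ffffULL	/* stand-in: generation bits that fit in an MMIO spte */
#define ADDRESS_SPACE_NUM	2ULL		/* stand-in for KVM_ADDRESS_SPACE_NUM */

/* Returns 1 when all shadow pages with cached MMIO sptes must be zapped. */
static int mmio_gen_wrapped(uint64_t memslots_gen)
{
	uint64_t gen = memslots_gen;

	assert(!(gen & GEN_UPDATE_IN_PROGRESS));	/* caller passes a published generation */

	/* Keep only the bits that are actually stored in an MMIO spte. */
	gen &= MMIO_SPTE_GEN_MASK;

	/*
	 * The low bits effectively select the address space; drop them so a
	 * wrap is detected no matter which address space incremented last.
	 */
	gen &= ~(ADDRESS_SPACE_NUM - 1);

	return gen == 0;
}

int main(void)
{
	printf("%d\n", mmio_gen_wrapped(0x1234));	/* mid-range generation: not a wrap */
	printf("%d\n", mmio_gen_wrapped(0x40001));	/* only the address-space bit survives: a wrap */
	return 0;
}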
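
The get_vmx_mem_address() hunk distinguishes two truncations: the effective address (displacement plus base plus scaled index) is truncated to the instruction's address size, while the linear address (segment base plus effective address) is truncated to 32 bits only outside 64-bit mode. A minimal model of that arithmetic follows, with the decoded fields passed in directly, the addr_size encoding (0 = 16-bit, 1 = 32-bit, 2 = 64-bit) taken from the hunk, and all segment permission checks omitted.

#include <stdint.h>
#include <stdio.h>

/* Model of the address calculation only; #GP/#SS segment checks are left out. */
static uint64_t vmx_linear_addr(uint64_t displacement, uint64_t base_reg,
				uint64_t index_reg, unsigned int scaling,
				unsigned int addr_size, uint64_t seg_base,
				int long_mode)
{
	int64_t off = (int64_t)displacement;

	/* The displacement is sign-extended to the address size... */
	if (addr_size == 1)
		off = (int32_t)off;
	else if (addr_size == 0)
		off = (int16_t)off;

	off += base_reg;
	off += index_reg << scaling;

	/* ...and the resulting *effective* address is truncated to it. */
	if (addr_size == 1)
		off &= 0xffffffffu;
	else if (addr_size == 0)
		off &= 0xffffu;

	/*
	 * The linear address is never truncated in 64-bit mode (a non-zero
	 * FS/GS base can push it above 4G); outside 64-bit mode it is
	 * unconditionally 32 bits.
	 */
	if (long_mode)
		return seg_base + (uint64_t)off;
	return (seg_base + (uint64_t)off) & 0xffffffffu;
}

int main(void)
{
	/* 32-bit address size: a negative displacement wraps within 4G. */
	printf("%#llx\n", (unsigned long long)
	       vmx_linear_addr(0xfffffff0u, 0x1000, 2, 3, 1, 0, 1));
	return 0;
}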