diff options
Diffstat (limited to 'arch/arm64/kvm/hyp')
43 files changed, 4082 insertions, 1145 deletions
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 9f4e8d68ab50..11a10d8f5beb 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -126,7 +126,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) add x1, x1, #VCPU_CONTEXT - ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN) // Store the guest regs x2 and x3 stp x2, x3, [x1, #CPU_XREG_OFFSET(2)] diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 424a5107cddb..bef40ddb16db 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -22,42 +22,36 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { - u64 val; - - if (unlikely(vcpu_has_nv(vcpu))) + if (has_vhe()) return vcpu_read_sys_reg(vcpu, reg); - else if (__vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; return __vcpu_sys_reg(vcpu, reg); } static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (unlikely(vcpu_has_nv(vcpu))) + if (has_vhe()) vcpu_write_sys_reg(vcpu, val, reg); - else if (!__vcpu_write_sys_reg_to_cpu(val, reg)) - __vcpu_sys_reg(vcpu, reg) = val; + else + __vcpu_assign_sys_reg(vcpu, reg, val); } static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, u64 val) { - if (unlikely(vcpu_has_nv(vcpu))) { + if (has_vhe()) { if (target_mode == PSR_MODE_EL1h) vcpu_write_sys_reg(vcpu, val, SPSR_EL1); else vcpu_write_sys_reg(vcpu, val, SPSR_EL2); - } else if (has_vhe()) { - write_sysreg_el1(val, SYS_SPSR); } else { - __vcpu_sys_reg(vcpu, SPSR_EL1) = val; + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); } } static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_abt); else vcpu->arch.ctxt.spsr_abt = val; @@ -65,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val) static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) { - if (has_vhe()) + if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) write_sysreg(val, spsr_und); else vcpu->arch.ctxt.spsr_und = val; @@ -339,6 +333,10 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_serror); + break; + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync); break; @@ -347,9 +345,13 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq); break; + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): + enter_exception64(vcpu, PSR_MODE_EL2h, except_type_serror); + break; + default: /* - * Only EL1_SYNC and EL2_{SYNC,IRQ} makes + * Only EL1_{SYNC,SERR} and EL2_{SYNC,IRQ,SERR} makes * sense so far. Everything else gets silently * ignored. */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 17df94570f03..fc573fc767b0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -12,6 +12,16 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool __fault_safe_to_translate(u64 esr) +{ + u64 fsc = esr & ESR_ELx_FSC; + + if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) + return false; + + return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); +} + static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { int ret; @@ -44,34 +54,50 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) return true; } -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +/* + * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. + */ +static inline bool __hpfar_valid(u64 esr) { - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 + * CPUs affected by ARM erratum #834220 may incorrectly report a + * stage-2 translation fault when a stage-1 permission fault occurs. * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. + * Re-walk the page tables to determine if a stage-1 fault actually + * occurred. */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - esr_fsc_is_permission_fault(esr))) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { + if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && + esr_fsc_is_translation_fault(esr)) + return false; + + if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) + return true; + + if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) + return true; + + return esr_fsc_is_addr_sz_fault(esr); +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar; + + fault->far_el2 = read_sysreg_el2(SYS_FAR); + fault->hpfar_el2 = 0; + + if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); - } + else if (unlikely(!__fault_safe_to_translate(esr))) + return true; + else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) + return false; - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; + /* + * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid + * HPFAR value. + */ + fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; return true; } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 23bbe28eaaf9..320cd45d49c5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -45,7 +45,7 @@ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) if (!vcpu_el1_is_32bit(vcpu)) return; - __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2); + __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2)); } static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) @@ -59,85 +59,139 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to * it will cause an exception. */ - if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) { + if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) write_sysreg(1 << 30, fpexc32_el2); - isb(); +} + +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); } -#define compute_clr_set(vcpu, reg, clr, set) \ - do { \ - u64 hfg; \ - hfg = __vcpu_sys_reg(vcpu, reg) & ~__ ## reg ## _RES0; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= ~hfg & __ ## reg ## _nMASK; \ - } while(0) +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); -#define reg_to_fgt_group_id(reg) \ - ({ \ - enum fgt_group_id id; \ - switch(reg) { \ - case HFGRTR_EL2: \ - case HFGWTR_EL2: \ - id = HFGxTR_GROUP; \ - break; \ - case HFGITR_EL2: \ - id = HFGITR_GROUP; \ - break; \ - case HDFGRTR_EL2: \ - case HDFGWTR_EL2: \ - id = HDFGRTR_GROUP; \ - break; \ - case HAFGRTR_EL2: \ - id = HAFGRTR_GROUP; \ - break; \ - default: \ - BUILD_BUG_ON(1); \ - } \ - \ - id; \ - }) - -#define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ - do { \ - u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= hfg & __ ## reg ## _nMASK; \ - } while(0) + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} -#define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ - do { \ - u64 c = 0, s = 0; \ - \ - ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ - compute_clr_set(vcpu, reg, c, s); \ - \ - compute_undef_clr_set(vcpu, kvm, reg, c, s); \ - \ - s |= set; \ - c |= clr; \ - if (c || s) { \ - u64 val = __ ## reg ## _nMASK; \ - val |= s; \ - val &= ~c; \ - write_sysreg_s(val, SYS_ ## reg); \ - } \ - } while(0) +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; -#define update_fgt_traps(hctxt, vcpu, kvm, reg) \ - update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; -/* - * Validate the fine grain trap masks. - * Check that the masks do not overlap and that all bits are accounted for. - */ -#define CHECK_FGT_MASKS(reg) \ - do { \ - BUILD_BUG_ON((__ ## reg ## _MASK) & (__ ## reg ## _nMASK)); \ - BUILD_BUG_ON(~((__ ## reg ## _RES0) ^ (__ ## reg ## _MASK) ^ \ - (__ ## reg ## _nMASK))); \ - } while(0) + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} static inline bool cpu_has_amu(void) { @@ -147,66 +201,99 @@ static inline bool cpu_has_amu(void) ID_AA64PFR0_EL1_AMU_SHIFT); } +#define __activate_fgt(hctxt, vcpu, reg) \ + do { \ + ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ + write_sysreg_s(*vcpu_fgt(vcpu, reg), SYS_ ## reg); \ + } while (0) + static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - - CHECK_FGT_MASKS(HFGRTR_EL2); - CHECK_FGT_MASKS(HFGWTR_EL2); - CHECK_FGT_MASKS(HFGITR_EL2); - CHECK_FGT_MASKS(HDFGRTR_EL2); - CHECK_FGT_MASKS(HDFGWTR_EL2); - CHECK_FGT_MASKS(HAFGRTR_EL2); - CHECK_FGT_MASKS(HCRX_EL2); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); - update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, - cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? - HFGxTR_EL2_TCR_EL1_MASK : 0); - update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); - update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, HFGITR_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR_EL2); if (cpu_has_amu()) - update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); + __activate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __activate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __activate_fgt(hctxt, vcpu, HFGITR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } -#define __deactivate_fgt(htcxt, vcpu, kvm, reg) \ +static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + __activate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); +} + +#define __deactivate_fgt(hctxt, vcpu, reg) \ do { \ - if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \ - kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \ - write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ - SYS_ ## reg); \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ } while(0) static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - __deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2); - if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) - write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); - else - __deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2); if (cpu_has_amu()) - __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); +} + +static inline void __deactivate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + __deactivate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); + } static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -216,18 +303,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); @@ -235,6 +325,8 @@ static inline void __deactivate_traps_mpam(void) static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); @@ -244,55 +336,46 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - if (kvm_arm_support_pmu_v3()) { - struct kvm_cpu_context *hctxt; - + if (system_supports_pmuv3()) { write_sysreg(0, pmselr_el0); - hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } - *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { - u64 clr = 0, set = 0; - - compute_clr_set(vcpu, HCRX_EL2, clr, set); - - hcrx |= set; - hcrx &= ~clr; + if (is_nested_ctxt(vcpu)) { + u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2); + hcrx |= val & __HCRX_EL2_MASK; + hcrx &= ~(~val & __HCRX_EL2_nMASK); } + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); write_sysreg_s(hcrx, SYS_HCRX_EL2); } __activate_traps_hfgxtr(vcpu); + __activate_traps_ich_hfgxtr(vcpu); __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { - write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) { - struct kvm_cpu_context *hctxt; - - hctxt = host_data_ptr(host_ctxt); + if (system_supports_pmuv3()) { write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } if (cpus_have_final_cap(ARM64_HAS_HCX)) - write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_ich_hfgxtr(vcpu); __deactivate_traps_mpam(); } @@ -301,23 +384,58 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) { + u64 vsesr; + + /* + * When HCR_EL2.AMO is set, physical SErrors are taken to EL2 + * and vSError injection is enabled for EL1. Conveniently, for + * NV this means that it is never the case where a 'physical' + * SError (injected by KVM or userspace) and vSError are + * deliverable to the same context. + * + * As such, we can trivially select between the host or guest's + * VSESR_EL2. Except for the case that FEAT_RAS hasn't been + * exposed to the guest, where ESR propagation in hardware + * occurs unconditionally. + * + * Paper over the architectural wart and use an IMPLEMENTATION + * DEFINED ESR value in case FEAT_RAS is hidden from the guest. + */ + if (!vserror_state_is_nested(vcpu)) + vsesr = vcpu->arch.vsesr_el2; + else if (kvm_has_ras(kern_hyp_va(vcpu->kvm))) + vsesr = __vcpu_sys_reg(vcpu, VSESR_EL2); + else + vsesr = ESR_ELx_ISV; - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) - write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); + write_sysreg_s(vsesr, SYS_VSESR_EL2); + } } static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) { + u64 *hcr; + + if (vserror_state_is_nested(vcpu)) + hcr = __ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2); + else + hcr = &vcpu->arch.hcr_el2; + /* * If we pended a virtual abort, preserve it until it gets * cleared. See D1.14.3 (Virtual Interrupts) for details, but * the crucial bit is "On taking a vSError interrupt, * HCR_EL2.VSE is cleared to 0." + * + * Additionally, when in a nested context we need to propagate the + * updated state to the guest hypervisor's HCR_EL2. */ - if (vcpu->arch.hcr_el2 & HCR_VSE) { - vcpu->arch.hcr_el2 &= ~HCR_VSE; - vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE; + if (*hcr & HCR_VSE) { + *hcr &= ~HCR_VSE; + *hcr |= read_sysreg(hcr_el2) & HCR_VSE; } } @@ -344,11 +462,13 @@ static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + u64 zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* * The vCPU's saved SVE state layout always matches the max VL of the * vCPU. Start off with the max VL so we can load the SVE state. */ - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); @@ -358,8 +478,10 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) * nested guest, as the guest hypervisor could select a smaller VL. Slap * that into hardware before wrapping up. */ - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) - sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + if (is_nested_ctxt(vcpu)) { + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); + } write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); } @@ -383,11 +505,11 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) return; if (vcpu_has_sve(vcpu)) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* A guest hypervisor may restrict the effective max VL. */ - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) - zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); - else - zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + if (is_nested_ctxt(vcpu)) + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); write_sysreg_el2(zcr_el2, SYS_ZCR); @@ -406,14 +528,14 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) /* * When the guest owns the FP regs, we know that guest+hyp traps for * any FPSIMD/SVE/SME features exposed to the guest have been disabled - * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * by either __activate_cptr_traps() or kvm_hyp_handle_fpsimd() * prior to __guest_entry(). As __guest_entry() guarantees a context * synchronization event, we don't need an ISB here to avoid taking * traps for anything that was exposed to the guest. */ if (vcpu_has_sve(vcpu)) { zcr_el1 = read_sysreg_el1(SYS_ZCR); - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1); /* * The guest's state is always saved using the guest's max VL. @@ -443,11 +565,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) */ if (system_supports_sve()) { __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - } else { __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); } @@ -498,10 +615,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); - else - cpacr_clear_set(0, CPACR_EL1_FPEN); + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ @@ -523,6 +637,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + return true; } @@ -766,7 +887,7 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, return false; } -static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu) { /* * Check for the conditions of Cortex-A510's #2077057. When these occur diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 76ff095c6b6e..a17cbe7582de 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -43,6 +43,17 @@ static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) return &ctxt_sys_reg(ctxt, MDSCR_EL1); } +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); @@ -98,6 +109,28 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); } +static inline bool ctxt_has_ras(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_ras(kern_hyp_va(vcpu->kvm)); +} + +static inline bool ctxt_has_sctlr2(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_SCTLR2)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_sctlr2(kern_hyp_va(vcpu->kvm)); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -136,6 +169,9 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1); ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR); ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + ctxt_sys_reg(ctxt, SCTLR2_EL1) = read_sysreg_el1(SYS_SCTLR2); } static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) @@ -148,8 +184,13 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) if (!has_vhe() && ctxt->__hyp_running_vcpu) ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); + else if (ctxt_has_ras(ctxt)) + ctxt_sys_reg(ctxt, VDISR_EL2) = read_sysreg_s(SYS_VDISR_EL2); } static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) @@ -168,8 +209,9 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) } static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, - u64 mpidr) + u64 midr, u64 mpidr) { + write_sysreg(midr, vpidr_el2); write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || @@ -240,6 +282,9 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, write_sysreg(ctxt_sys_reg(ctxt, SP_EL1), sp_el1); write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1), SYS_ELR); write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR); + + if (ctxt_has_sctlr2(ctxt)) + write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR2_EL1), SYS_SCTLR2); } /* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */ @@ -263,6 +308,7 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx { u64 pstate = to_hw_pstate(ctxt); u64 mode = pstate & PSR_AA32_MODE_MASK; + u64 vdisr; /* * Safety check to ensure we're setting the CPU up to enter the guest @@ -281,8 +327,17 @@ static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctx write_sysreg_el2(ctxt->regs.pc, SYS_ELR); write_sysreg_el2(pstate, SYS_SPSR); - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) - write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2); + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return; + + if (!vserror_state_is_nested(ctxt_to_vcpu(ctxt))) + vdisr = ctxt_sys_reg(ctxt, DISR_EL1); + else if (ctxt_has_ras(ctxt)) + vdisr = ctxt_sys_reg(ctxt, VDISR_EL2); + else + vdisr = 0; + + write_sysreg_s(vdisr, SYS_VDISR_EL2); } static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) @@ -295,11 +350,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); - __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); - __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); + __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); + __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); if (has_vhe() || kvm_debug_regs_in_use(vcpu)) - __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); + __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); } static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h new file mode 100644 index 000000000000..1258bc84477f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ +#define __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ + +#include <asm/kvm_hypevents.h> + +#include <linux/arm-smccc.h> + +#define hyp_smccc_1_1_smc(...) \ + do { \ + trace_hyp_exit(NULL, HYP_REASON_SMC); \ + arm_smccc_1_1_smc(__VA_ARGS__); \ + trace_hyp_enter(NULL, HYP_REASON_SMC); \ + } while (0) + +#define hyp_smccc_1_2_smc(...) \ + do { \ + trace_hyp_exit(NULL, HYP_REASON_SMC); \ + arm_smccc_1_2_smc(__VA_ARGS__); \ + trace_hyp_enter(NULL, HYP_REASON_SMC); \ + } while (0) + +#endif /* __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/clock.h b/arch/arm64/kvm/hyp/include/nvhe/clock.h new file mode 100644 index 000000000000..9f429f5c0664 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/clock.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_KVM_HYP_NVHE_CLOCK_H +#define __ARM64_KVM_HYP_NVHE_CLOCK_H +#include <linux/types.h> + +#include <asm/kvm_hyp.h> + +#ifdef CONFIG_NVHE_EL2_TRACING +void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc); +u64 trace_clock(void); +#else +static inline void +trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { } +static inline u64 trace_clock(void) { return 0; } +#endif +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/define_events.h b/arch/arm64/kvm/hyp/include/nvhe/define_events.h new file mode 100644 index 000000000000..776d4c6cb702 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/define_events.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef HYP_EVENT +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + struct hyp_event_id hyp_event_id_##__name \ + __section(".hyp.event_ids."#__name) = { \ + .enabled = ATOMIC_INIT(0), \ + } + +#define HYP_EVENT_MULTI_READ +#include <asm/kvm_hypevents.h> +#undef HYP_EVENT_MULTI_READ + +#undef HYP_EVENT diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 978f38c386ee..3cbfae0e3dda 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -27,24 +27,28 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, - PKVM_ID_FFA, + PKVM_ID_GUEST, }; -extern unsigned long hyp_nr_cpus; - int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn); +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn); int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); -int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu); +int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys); +int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); -int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm); int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); @@ -56,7 +60,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); @@ -67,4 +71,12 @@ static __always_inline void __load_host_stage2(void) else write_sysreg(0, vttbr_el2); } + +#ifdef CONFIG_NVHE_EL2_DEBUG +void pkvm_ownership_selftest(void *base); +struct pkvm_hyp_vcpu *init_selftest_vm(void *virt); +void teardown_selftest_vm(void); +#else +static inline void pkvm_ownership_selftest(void *base) { } +#endif #endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 34233d586060..b50712d47f6d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -8,23 +8,36 @@ #include <linux/types.h> /* - * Bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). + * Bits 0-1 are used to encode the memory ownership state of each page from the + * point of view of a pKVM "component" (host, hyp, guest, ... see enum + * pkvm_component_id): + * 00: The page is owned and exclusively accessible by the component; + * 01: The page is owned and accessible by the component, but is also + * accessible by another component; + * 10: The page is accessible but not owned by the component; + * The storage of this state depends on the component: either in the + * hyp_vmemmap for the host and hyp states or in PTE software bits for guests. */ enum pkvm_page_state { PKVM_PAGE_OWNED = 0ULL, PKVM_PAGE_SHARED_OWNED = BIT(0), PKVM_PAGE_SHARED_BORROWED = BIT(1), - __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE = BIT(2), + /* + * 'Meta-states' are not stored directly in PTE SW bits for guest + * states, but inferred from the context (e.g. invalid PTE entries). + * For the host and hyp, meta-states are stored directly in the + * struct hyp_page. + */ + PKVM_NOPAGE = BIT(0) | BIT(1), + + /* + * 'Meta-states' which aren't encoded directly in the PTE's SW bits (or + * the hyp_vmemmap entry for the host) + */ + PKVM_POISON = BIT(2), }; -#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) +#define PKVM_PAGE_STATE_VMEMMAP_MASK (BIT(0) | BIT(1)) #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, @@ -44,8 +57,15 @@ struct hyp_page { u16 refcount; u8 order; - /* Host (non-meta) state. Guarded by the host stage-2 lock. */ - enum pkvm_page_state host_state : 8; + /* Host state. Guarded by the host stage-2 lock. */ + unsigned __host_state : 4; + + /* + * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use + * the complement so that the initial 0 in __hyp_state_comp (due to the + * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE. + */ + unsigned __hyp_state_comp : 4; u32 host_share_guest_count; }; @@ -82,6 +102,26 @@ static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +static inline enum pkvm_page_state get_host_state(struct hyp_page *p) +{ + return p->__host_state; +} + +static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__host_state = state; +} + +static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) +{ + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_VMEMMAP_MASK; +} + +static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_VMEMMAP_MASK; +} + /* * Refcounting for 'struct hyp_page'. * hyp_pool::lock must be held if atomic access to the refcount is required. diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 230e4f2527de..6e83ce35c2f2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,9 +13,11 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -int hyp_create_pcpu_fixmap(void); +int hyp_create_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); void hyp_fixmap_unmap(void); +void *hyp_fixblock_map(phys_addr_t phys, size_t *size); +void hyp_fixblock_unmap(void); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index e42bf68c8848..c904647d2f76 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -29,7 +29,7 @@ struct pkvm_hyp_vcpu { }; /* - * Holds the relevant data for running a protected vm. + * Holds the relevant data for running a vm in protected mode. */ struct pkvm_hyp_vm { struct kvm kvm; @@ -43,12 +43,6 @@ struct pkvm_hyp_vm { struct hyp_pool pool; hyp_spinlock_t lock; - /* - * The number of vcpus initialized and ready to run. - * Modifying this is protected by 'vm_table_lock'. - */ - unsigned int nr_vcpus; - /* Array of the hyp vCPU structures for this VM. */ struct pkvm_hyp_vcpu *vcpus[]; }; @@ -73,12 +67,18 @@ static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) void pkvm_hyp_vm_table_init(void *tbl); +int __pkvm_reserve_vm(void); +void __pkvm_unreserve_vm(pkvm_handle_t handle); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva); -int __pkvm_teardown_vm(pkvm_handle_t handle); +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn); +int __pkvm_start_teardown_vm(pkvm_handle_t handle); +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle); + +struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); @@ -88,6 +88,7 @@ struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h new file mode 100644 index 000000000000..8813ff250f8e --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H +#define __ARM64_KVM_HYP_NVHE_TRACE_H + +#include <linux/trace_remote_event.h> + +#include <asm/kvm_hyptrace.h> + +static inline pid_t __tracing_get_vcpu_pid(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!host_ctxt) + host_ctxt = host_data_ptr(host_ctxt); + + vcpu = host_ctxt->__hyp_running_vcpu; + + return vcpu ? vcpu->arch.pid : 0; +} + +#define HE_PROTO(__args...) __args +#define HE_ASSIGN(__args...) __args +#define HE_STRUCT RE_STRUCT +#define he_field re_field + +#ifdef CONFIG_NVHE_EL2_TRACING + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + REMOTE_EVENT_FORMAT(__name, __struct); \ + extern struct hyp_event_id hyp_event_id_##__name; \ + static __always_inline void trace_##__name(__proto) \ + { \ + struct remote_event_format_##__name *__entry; \ + size_t length = sizeof(*__entry); \ + \ + if (!atomic_read(&hyp_event_id_##__name.enabled)) \ + return; \ + __entry = tracing_reserve_entry(length); \ + if (!__entry) \ + return; \ + __entry->hdr.id = hyp_event_id_##__name.id; \ + __assign \ + tracing_commit_entry(); \ + } + +void *tracing_reserve_entry(unsigned long length); +void tracing_commit_entry(void); + +int __tracing_load(unsigned long desc_va, size_t desc_size); +void __tracing_unload(void); +int __tracing_enable(bool enable); +int __tracing_swap_reader(unsigned int cpu); +void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc); +int __tracing_reset(unsigned int cpu); +int __tracing_enable_event(unsigned short id, bool enable); +#else +static inline void *tracing_reserve_entry(unsigned long length) { return NULL; } +static inline void tracing_commit_entry(void) { } +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + static inline void trace_##__name(__proto) {} + +static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { return -ENODEV; } +static inline void __tracing_unload(void) { } +static inline int __tracing_enable(bool enable) { return -ENODEV; } +static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; } +static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { } +static inline int __tracing_reset(unsigned int cpu) { return -ENODEV; } +static inline int __tracing_enable_event(unsigned short id, bool enable) { return -ENODEV; } +#endif +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..32d7b7746e8e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -12,7 +12,10 @@ #include <asm/kvm_host.h> #define cpu_reg(ctxt, r) (ctxt)->regs.regs[r] -#define DECLARE_REG(type, name, ctxt, reg) \ +#define DECLARE_REG(type, name, ctxt, reg) \ + __always_unused int ___check_reg_ ## reg; \ type name = (type)cpu_reg(ctxt, (reg)) +void inject_host_exception(u64 esr); + #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index b43426a493df..62cdfbff7562 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -12,12 +12,12 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ ccflags-y += -fno-stack-protector \ -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) + $(DISABLE_KSTACK_ERASE) hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include -lib-objs := clear_page.o copy_page.o memcpy.o memset.o +lib-objs := clear_page.o copy_page.o memcpy.o memset.o tishift.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) CFLAGS_switch.nvhe.o += -Wno-override-init @@ -26,10 +26,15 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ../vgic-v5-sr.o +hyp-obj-y += ../../../kernel/smccc-call.o hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o +hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o events.o hyp-obj-y += $(lib-objs) +# Path to simple_ring_buffer.c +CFLAGS_trace.nvhe.o += -I$(srctree)/kernel/trace/ + ## ## Build rules for compiling nVHE hyp code ## Output of this folder is `kvm_nvhe.o`, a partially linked object @@ -99,3 +104,9 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAG # causes a build failure. Remove profile optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + +ifeq ($(CONFIG_UBSAN_KVM_EL2),y) +UBSAN_SANITIZE := y +# Always use brk and not hooks +ccflags-y += $(CFLAGS_UBSAN_TRAP) +endif diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c new file mode 100644 index 000000000000..a7fc61976fd0 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <nvhe/clock.h> + +#include <asm/arch_timer.h> +#include <asm/div64.h> + +static struct clock_data { + struct { + u32 mult; + u32 shift; + u64 epoch_ns; + u64 epoch_cyc; + u64 cyc_overflow64; + } data[2]; + u64 cur; +} trace_clock_data; + +static u64 __clock_mult_uint128(u64 cyc, u32 mult, u32 shift) +{ + __uint128_t ns = (__uint128_t)cyc * mult; + + ns >>= shift; + + return (u64)ns; +} + +/* Does not guarantee no reader on the modified bank. */ +void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) +{ + struct clock_data *clock = &trace_clock_data; + u64 bank = clock->cur ^ 1; + + if (!mult || shift >= 64) + return; + + clock->data[bank].mult = mult; + clock->data[bank].shift = shift; + clock->data[bank].epoch_ns = epoch_ns; + clock->data[bank].epoch_cyc = epoch_cyc; + clock->data[bank].cyc_overflow64 = ULONG_MAX / mult; + + smp_store_release(&clock->cur, bank); +} + +/* Use untrusted host data */ +u64 trace_clock(void) +{ + struct clock_data *clock = &trace_clock_data; + u64 bank = smp_load_acquire(&clock->cur); + u64 cyc, ns; + + cyc = __arch_counter_get_cntvct() - clock->data[bank].epoch_cyc; + + if (likely(cyc < clock->data[bank].cyc_overflow64)) { + ns = cyc * clock->data[bank].mult; + ns >>= clock->data[bank].shift; + } else { + ns = __clock_mult_uint128(cyc, clock->data[bank].mult, + clock->data[bank].shift); + } + + return (u64)ns + clock->data[bank].epoch_ns; +} diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 2f4a4f5036bb..f8904391c125 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -14,20 +14,20 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> -static void __debug_save_spe(u64 *pmscr_el1) +static void __debug_save_spe(void) { - u64 reg; + u64 *pmscr_el1, *pmblimitr_el1; - /* Clear pmscr in case of early return */ - *pmscr_el1 = 0; + pmscr_el1 = host_data_ptr(host_debug_state.pmscr_el1); + pmblimitr_el1 = host_data_ptr(host_debug_state.pmblimitr_el1); /* * At this point, we know that this CPU implements * SPE and is available to the host. * Check if the host is actually using it ? */ - reg = read_sysreg_s(SYS_PMBLIMITR_EL1); - if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT))) + *pmblimitr_el1 = read_sysreg_s(SYS_PMBLIMITR_EL1); + if (!(*pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT))) return; /* Yes; save the control register and disable data generation */ @@ -37,18 +37,29 @@ static void __debug_save_spe(u64 *pmscr_el1) /* Now drain all buffered data to memory */ psb_csync(); + dsb(nsh); + + /* And disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); } -static void __debug_restore_spe(u64 pmscr_el1) +static void __debug_restore_spe(void) { - if (!pmscr_el1) + u64 pmblimitr_el1 = *host_data_ptr(host_debug_state.pmblimitr_el1); + + if (!(pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT))) return; /* The host page table is installed, but not yet synchronised */ isb(); + /* Re-enable the profiling buffer. */ + write_sysreg_s(pmblimitr_el1, SYS_PMBLIMITR_EL1); + isb(); + /* Re-enable data generation */ - write_sysreg_el1(pmscr_el1, SYS_PMSCR); + write_sysreg_el1(*host_data_ptr(host_debug_state.pmscr_el1), SYS_PMSCR); } static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) @@ -57,12 +68,54 @@ static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) write_sysreg_el1(new_trfcr, SYS_TRFCR); } -static bool __trace_needs_drain(void) +static void __trace_drain_and_disable(void) { - if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) - return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + u64 *trblimitr_el1 = host_data_ptr(host_debug_state.trblimitr_el1); + bool needs_drain = is_protected_kvm_enabled() ? + host_data_test_flag(HAS_TRBE) : + host_data_test_flag(TRBE_ENABLED); - return host_data_test_flag(TRBE_ENABLED); + if (!needs_drain) { + *trblimitr_el1 = 0; + return; + } + + *trblimitr_el1 = read_sysreg_s(SYS_TRBLIMITR_EL1); + if (*trblimitr_el1 & TRBLIMITR_EL1_E) { + /* + * The host has enabled the Trace Buffer Unit so we have + * to beat the CPU with a stick until it stops accessing + * memory. + */ + + /* First, ensure that our prior write to TRFCR has stuck. */ + isb(); + + /* Now synchronise with the trace and drain the buffer. */ + tsb_csync(); + dsb(nsh); + + /* + * With no more trace being generated, we can disable the + * Trace Buffer Unit. + */ + write_sysreg_s(0, SYS_TRBLIMITR_EL1); + if (cpus_have_final_cap(ARM64_WORKAROUND_2064142)) { + /* + * Some CPUs are so good, we have to drain 'em + * twice. + */ + tsb_csync(); + dsb(nsh); + } + + /* + * Ensure that the Trace Buffer Unit is disabled before + * we start mucking with the stage-2 and trap + * configuration. + */ + isb(); + } } static bool __trace_needs_switch(void) @@ -79,24 +132,69 @@ static void __trace_switch_to_guest(void) __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), *host_data_ptr(trfcr_while_in_guest)); - - if (__trace_needs_drain()) { - isb(); - tsb_csync(); - } + __trace_drain_and_disable(); } static void __trace_switch_to_host(void) { + u64 trblimitr_el1 = *host_data_ptr(host_debug_state.trblimitr_el1); + + if (trblimitr_el1 & TRBLIMITR_EL1_E) { + /* Re-enable the Trace Buffer Unit for the host. */ + write_sysreg_s(trblimitr_el1, SYS_TRBLIMITR_EL1); + isb(); + if (cpus_have_final_cap(ARM64_WORKAROUND_2038923)) { + /* + * Make sure the unit is re-enabled before we + * poke TRFCR. + */ + isb(); + } + } + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), *host_data_ptr(host_debug_state.trfcr_el1)); } +static void __debug_save_brbe(void) +{ + u64 *brbcr_el1 = host_data_ptr(host_debug_state.brbcr_el1); + + *brbcr_el1 = 0; + + /* Check if the BRBE is enabled */ + if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE))) + return; + + /* + * Prohibit branch record generation while we are in guest. + * Since access to BRBCR_EL1 is trapped, the guest can't + * modify the filtering set by the host. + */ + *brbcr_el1 = read_sysreg_el1(SYS_BRBCR); + write_sysreg_el1(0, SYS_BRBCR); +} + +static void __debug_restore_brbe(void) +{ + u64 brbcr_el1 = *host_data_ptr(host_debug_state.brbcr_el1); + + if (!brbcr_el1) + return; + + /* Restore BRBE controls */ + write_sysreg_el1(brbcr_el1, SYS_BRBCR); +} + void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ if (host_data_test_flag(HAS_SPE)) - __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); + __debug_save_spe(); + + /* Disable BRBE branch records */ + if (host_data_test_flag(HAS_BRBE)) + __debug_save_brbe(); if (__trace_needs_switch()) __trace_switch_to_guest(); @@ -110,7 +208,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { if (host_data_test_flag(HAS_SPE)) - __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); + __debug_restore_spe(); + if (host_data_test_flag(HAS_BRBE)) + __debug_restore_brbe(); if (__trace_needs_switch()) __trace_switch_to_host(); } diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c new file mode 100644 index 000000000000..add9383aadb5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/events.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <nvhe/mm.h> +#include <nvhe/trace.h> + +#include <nvhe/define_events.h> + +int __tracing_enable_event(unsigned short id, bool enable) +{ + struct hyp_event_id *event_id = &__hyp_event_ids_start[id]; + atomic_t *enabled; + + if (event_id >= __hyp_event_ids_end) + return -EINVAL; + + enabled = hyp_fixmap_map(__hyp_pa(&event_id->enabled)); + atomic_set(enabled, enable); + hyp_fixmap_unmap(); + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e433dfab882a..1af722771178 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -26,10 +26,10 @@ * the duration and are therefore serialised. */ -#include <linux/arm-smccc.h> #include <linux/arm_ffa.h> #include <asm/kvm_pkvm.h> +#include <nvhe/arm-smccc.h> #include <nvhe/ffa.h> #include <nvhe/mem_protect.h> #include <nvhe/memory.h> @@ -71,36 +71,68 @@ static u32 hyp_ffa_version; static bool has_version_negotiated; static hyp_spinlock_t version_lock; -static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno) { - *res = (struct arm_smccc_res) { + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_ERROR, .a2 = ffa_errno, }; } -static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop) { if (ret == FFA_RET_SUCCESS) { - *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, - .a2 = prop }; + *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS, + .a2 = prop }; } else { ffa_to_smccc_error(res, ret); } } -static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret) { ffa_to_smccc_res_prop(res, ret, 0); } static void ffa_set_retval(struct kvm_cpu_context *ctxt, - struct arm_smccc_res *res) + struct arm_smccc_1_2_regs *res) { cpu_reg(ctxt, 0) = res->a0; cpu_reg(ctxt, 1) = res->a1; cpu_reg(ctxt, 2) = res->a2; cpu_reg(ctxt, 3) = res->a3; + cpu_reg(ctxt, 4) = res->a4; + cpu_reg(ctxt, 5) = res->a5; + cpu_reg(ctxt, 6) = res->a6; + cpu_reg(ctxt, 7) = res->a7; + + /* + * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30. + * + * In FF-A 1.2, we cannot rely on the function ID sent by the caller to + * detect 32-bit calls because the CPU cycle management interfaces (e.g. + * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses. + * + * FFA-1.3 introduces 64-bit variants of the CPU cycle management + * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests + * complete with SMC32 direct responses which *should* allow us use the + * function ID sent by the caller to determine whether to return x8-x17. + * + * Note that we also cannot rely on function IDs in the response. + * + * Given the above, assume SMC64 and send back x0-x17 unconditionally + * as the passthrough code (__kvm_hyp_host_forward_smc) does the same. + */ + cpu_reg(ctxt, 8) = res->a8; + cpu_reg(ctxt, 9) = res->a9; + cpu_reg(ctxt, 10) = res->a10; + cpu_reg(ctxt, 11) = res->a11; + cpu_reg(ctxt, 12) = res->a12; + cpu_reg(ctxt, 13) = res->a13; + cpu_reg(ctxt, 14) = res->a14; + cpu_reg(ctxt, 15) = res->a15; + cpu_reg(ctxt, 16) = res->a16; + cpu_reg(ctxt, 17) = res->a17; } static bool is_ffa_call(u64 func_id) @@ -113,82 +145,92 @@ static bool is_ffa_call(u64 func_id) static int ffa_map_hyp_buffers(u64 ffa_page_count) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, - hyp_virt_to_phys(hyp_buffers.tx), - hyp_virt_to_phys(hyp_buffers.rx), - ffa_page_count, - 0, 0, 0, 0, - &res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_RXTX_MAP, + .a1 = hyp_virt_to_phys(hyp_buffers.tx), + .a2 = hyp_virt_to_phys(hyp_buffers.rx), + .a3 = ffa_page_count, + }, &res); return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } static int ffa_unmap_hyp_buffers(void) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_RXTX_UNMAP, - HOST_FFA_ID, - 0, 0, 0, 0, 0, 0, - &res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RXTX_UNMAP, + .a1 = HOST_FFA_ID, + }, &res); return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } -static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fraglen, u32 endpoint_id) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, - handle_lo, handle_hi, fraglen, endpoint_id, - 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_TX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fraglen, + .a4 = endpoint_id, + }, res); } -static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fragoff) { - arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, - handle_lo, handle_hi, fragoff, HOST_FFA_ID, - 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_FRAG_RX, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = fragoff, + .a4 = HOST_FFA_ID, + }, res); } -static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, +static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, u32 fraglen) { - arm_smccc_1_1_smc(func_id, len, fraglen, - 0, 0, 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = func_id, + .a1 = len, + .a2 = fraglen, + }, res); } -static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, +static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 flags) { - arm_smccc_1_1_smc(FFA_MEM_RECLAIM, - handle_lo, handle_hi, flags, - 0, 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_MEM_RECLAIM, + .a1 = handle_lo, + .a2 = handle_hi, + .a3 = flags, + }, res); } -static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) +static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) { - arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, - len, len, - 0, 0, 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_FN64_MEM_RETRIEVE_REQ, + .a1 = len, + .a2 = len, + }, res); } -static void ffa_rx_release(struct arm_smccc_res *res) +static void ffa_rx_release(struct arm_smccc_1_2_regs *res) { - arm_smccc_1_1_smc(FFA_RX_RELEASE, - 0, 0, - 0, 0, 0, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_RX_RELEASE, + }, res); } -static void do_ffa_rxtx_map(struct arm_smccc_res *res, +static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(phys_addr_t, tx, ctxt, 1); @@ -267,7 +309,7 @@ err_unmap: goto out_unlock; } -static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, +static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -368,7 +410,7 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } -static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, +static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -427,7 +469,7 @@ out: } static void __do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, + struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); @@ -437,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id, struct ffa_mem_region_attributes *ep_mem_access; struct ffa_composite_mem_region *reg; struct ffa_mem_region *buf; - u32 offset, nr_ranges; + u32 offset, nr_ranges, checked_offset; int ret = 0; if (addr_mbz || npages_mbz || fraglen > len || @@ -474,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } - if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < checked_offset) { ret = FFA_RET_INVALID_PARAMETERS; goto out_unlock; } @@ -521,7 +568,7 @@ err_unshare: __do_ffa_mem_xfer((fid), (res), (ctxt)); \ } while (0); -static void do_ffa_mem_reclaim(struct arm_smccc_res *res, +static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, handle_lo, ctxt, 1); @@ -628,13 +675,26 @@ static bool ffa_call_supported(u64 func_id) case FFA_RXTX_MAP: case FFA_MEM_DONATE: case FFA_MEM_RETRIEVE_REQ: + /* Optional notification interfaces added in FF-A 1.1 */ + case FFA_NOTIFICATION_BITMAP_CREATE: + case FFA_NOTIFICATION_BITMAP_DESTROY: + case FFA_NOTIFICATION_BIND: + case FFA_NOTIFICATION_UNBIND: + case FFA_NOTIFICATION_SET: + case FFA_NOTIFICATION_GET: + case FFA_NOTIFICATION_INFO_GET: + /* Optional interfaces added in FF-A 1.2 */ + case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */ + case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */ + case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */ + case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */ return false; } return true; } -static bool do_ffa_features(struct arm_smccc_res *res, +static bool do_ffa_features(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, id, ctxt, 1); @@ -666,21 +726,25 @@ out_handled: static int hyp_ffa_post_init(void) { size_t min_rxtx_sz; - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; - arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_ID_GET, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; if (res.a2 != HOST_FFA_ID) return -EINVAL; - arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, - 0, 0, 0, 0, 0, 0, &res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + .a0 = FFA_FEATURES, + .a1 = FFA_FN64_RXTX_MAP, + }, &res); if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; - switch (res.a2) { + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { case FFA_FEAT_RXTX_MIN_SZ_4K: min_rxtx_sz = SZ_4K; break; @@ -700,7 +764,7 @@ static int hyp_ffa_post_init(void) return 0; } -static void do_ffa_version(struct arm_smccc_res *res, +static void do_ffa_version(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, ffa_req_version, ctxt, 1); @@ -712,7 +776,10 @@ static void do_ffa_version(struct arm_smccc_res *res, hyp_spin_lock(&version_lock); if (has_version_negotiated) { - res->a0 = hyp_ffa_version; + if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) + res->a0 = FFA_RET_NOT_SUPPORTED; + else + res->a0 = hyp_ffa_version; goto unlock; } @@ -721,26 +788,27 @@ static void do_ffa_version(struct arm_smccc_res *res, * first if TEE supports it. */ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { - arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0, - 0, 0, 0, 0, 0, - res); - if (res->a0 == FFA_RET_NOT_SUPPORTED) + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = ffa_req_version, + }, res); + if ((s32)res->a0 == FFA_RET_NOT_SUPPORTED) goto unlock; hyp_ffa_version = ffa_req_version; } - if (hyp_ffa_post_init()) + if (hyp_ffa_post_init()) { res->a0 = FFA_RET_NOT_SUPPORTED; - else { - has_version_negotiated = true; + } else { + smp_store_release(&has_version_negotiated, true); res->a0 = hyp_ffa_version; } unlock: hyp_spin_unlock(&version_lock); } -static void do_ffa_part_get(struct arm_smccc_res *res, +static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, uuid0, ctxt, 1); @@ -756,9 +824,14 @@ static void do_ffa_part_get(struct arm_smccc_res *res, goto out_unlock; } - arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1, - uuid2, uuid3, flags, 0, 0, - res); + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_PARTITION_INFO_GET, + .a1 = uuid0, + .a2 = uuid1, + .a3 = uuid2, + .a4 = uuid3, + .a5 = flags, + }, res); if (res->a0 != FFA_SUCCESS) goto out_unlock; @@ -791,7 +864,7 @@ out_unlock: bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; /* * There's no way we can tell what a non-standard SMC call might @@ -809,7 +882,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) if (!is_ffa_call(func_id)) return false; - if (!has_version_negotiated && func_id != FFA_VERSION) { + if (func_id != FFA_VERSION && + !smp_load_acquire(&has_version_negotiated)) { ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); goto out_handled; } @@ -859,14 +933,17 @@ out_handled: int hyp_ffa_init(void *pages) { - struct arm_smccc_res res; + struct arm_smccc_1_2_regs res; void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; - arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res); - if (res.a0 == FFA_RET_NOT_SUPPORTED) + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + .a0 = FFA_VERSION, + .a1 = FFA_VERSION_1_2, + }, &res); + if ((s32)res.a0 == FFA_RET_NOT_SUPPORTED) return 0; /* @@ -885,10 +962,10 @@ int hyp_ffa_init(void *pages) if (FFA_MAJOR_VERSION(res.a0) != 1) return -EOPNOTSUPP; - if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1)) + if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2)) hyp_ffa_version = res.a0; else - hyp_ffa_version = FFA_VERSION_1_1; + hyp_ffa_version = FFA_VERSION_1_2; tx = pages; pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 58f0cb2298cc..9393fe3ea6a1 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -120,12 +120,11 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 -#ifdef CONFIG_NVHE_EL2_DEBUG +#ifdef CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM - msr hcr_el2, x0 - isb + msr_hcr_el2 x0 tlbi vmalls12e1 dsb nsh #endif @@ -291,13 +290,3 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc) ret SYM_CODE_END(__kvm_hyp_host_forward_smc) - -/* - * kvm_host_psci_cpu_entry is called through br instruction, which requires - * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external - * functions, but bti c instead. - */ -SYM_CODE_START(kvm_host_psci_cpu_entry) - bti j - b __kvm_host_psci_cpu_entry -SYM_CODE_END(kvm_host_psci_cpu_entry) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index f8af11189572..89cb553be1e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -100,7 +100,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) msr mair_el2, x1 ldr x1, [x0, #NVHE_INIT_HCR_EL2] - msr hcr_el2, x1 + msr_hcr_el2 x1 mov x2, #HCR_E2H and x2, x1, x2 @@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) ldr x1, [x0, #NVHE_INIT_PGD_PA] phys_to_ttbr x2, x1 alternative_if ARM64_HAS_CNP - orr x2, x2, #TTBR_CNP_BIT + orr x2, x2, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x2 @@ -173,9 +173,8 @@ SYM_CODE_END(___kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START(kvm_hyp_cpu_entry) - mov x1, #1 // is_cpu_on = true + ldr x29, =__kvm_host_psci_cpu_on_entry b __kvm_hyp_init_cpu -SYM_CODE_END(kvm_hyp_cpu_entry) /* * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point @@ -183,32 +182,17 @@ SYM_CODE_END(kvm_hyp_cpu_entry) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START(kvm_hyp_cpu_resume) - mov x1, #0 // is_cpu_on = false - b __kvm_hyp_init_cpu -SYM_CODE_END(kvm_hyp_cpu_resume) + ldr x29, =__kvm_host_psci_cpu_resume_entry -/* - * Common code for CPU entry points. Initializes EL2 state and - * installs the hypervisor before handing over to a C handler. - * - * x0: struct kvm_nvhe_init_params PA - * x1: bool is_cpu_on - */ -SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) +SYM_INNER_LABEL(__kvm_hyp_init_cpu, SYM_L_LOCAL) mov x28, x0 // Stash arguments - mov x29, x1 /* Check that the core was booted in EL2. */ mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 - b.eq 2f - - /* The core booted in EL1. KVM cannot be initialized on it. */ -1: wfe - wfi - b 1b + b.ne 1f -2: msr SPsel, #1 // We want to use SP_EL{1,2} + msr SPsel, #1 // We want to use SP_EL2 init_el2_hcr 0 @@ -218,11 +202,16 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) mov x0, x28 bl ___kvm_hyp_init // Clobbers x0..x2 - /* Leave idmap. */ - mov x0, x29 - ldr x1, =kvm_host_psci_cpu_entry - br x1 -SYM_CODE_END(__kvm_hyp_init_cpu) + /* Leave idmap -- using BLR is OK, LR is restored from host context */ + blr x29 + + // The core booted in EL1, or the C code unexpectedly returned. + // Either way, KVM cannot be initialized on it. +1: wfe + wfi + b 1b +SYM_CODE_END(kvm_hyp_cpu_resume) +SYM_CODE_END(kvm_hyp_cpu_entry) SYM_CODE_START(__kvm_handle_stub_hvc) /* @@ -260,11 +249,6 @@ reset: msr sctlr_el2, x5 isb -alternative_if ARM64_KVM_PROTECTED_MODE - mov_q x5, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x5 -alternative_else_nop_endif - /* Install stub vectors */ adr_l x5, __hyp_stub_vectors msr vbar_el2, x5 @@ -296,7 +280,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) /* Install the new pgtables */ phys_to_ttbr x5, x0 alternative_if ARM64_HAS_CNP - orr x5, x5, #TTBR_CNP_BIT + orr x5, x5, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x5 diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2c37680d954c..06db299c37a8 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -12,12 +12,14 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_host.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_hypevents.h> #include <asm/kvm_mmu.h> #include <nvhe/ffa.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> #include <nvhe/pkvm.h> +#include <nvhe/trace.h> #include <nvhe/trap_handler.h> DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -26,7 +28,7 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) { - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR)); /* * On saving/restoring guest sve state, always use the maximum VL for * the guest. The layout of the data when saving the sve state depends @@ -69,7 +71,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ isb(); if (vcpu_has_sve(vcpu)) @@ -79,7 +84,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); if (has_fpmr) - __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR)); if (system_supports_sve()) __hyp_sve_restore_host(); @@ -123,10 +128,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - /* Limit guest vector length to the maximum supported by the host. */ - hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & @@ -137,6 +138,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; + + hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid; } static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -158,6 +161,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } @@ -169,9 +173,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) DECLARE_REG(u64, hcr_el2, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; - if (!is_protected_kvm_enabled()) - return; - hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); if (!hyp_vcpu) return; @@ -180,17 +181,16 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) /* Propagate WFx trapping flags */ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } else { + memcpy(&hyp_vcpu->vcpu.arch.fgt, hyp_vcpu->host_vcpu->arch.fgt, + sizeof(hyp_vcpu->vcpu.arch.fgt)); } } static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) { - struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); - if (!is_protected_kvm_enabled()) - return; - - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (hyp_vcpu) pkvm_put_hyp_vcpu(hyp_vcpu); } @@ -245,17 +245,35 @@ static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) &host_vcpu->arch.pkvm_memcache); } -static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_host_donate_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, pfn, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || !pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) goto out; + ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -264,7 +282,7 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) if (ret) goto out; - ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot); + ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot); out: cpu_reg(host_ctxt, 1) = ret; } @@ -273,17 +291,15 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; - ret = __pkvm_host_unshare_guest(gfn, hyp_vm); + ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; @@ -296,9 +312,6 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -312,17 +325,15 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; - ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm); + ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; @@ -332,18 +343,16 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); - DECLARE_REG(bool, mkold, host_ctxt, 3); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(bool, mkold, host_ctxt, 4); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; - ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm); + ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; @@ -355,9 +364,6 @@ static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -417,12 +423,8 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); - struct pkvm_hyp_vm *hyp_vm; - - if (!is_protected_kvm_enabled()) - return; + struct pkvm_hyp_vm *hyp_vm = get_np_pkvm_hyp_vm(handle); - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) return; @@ -461,11 +463,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); } static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) @@ -479,17 +481,15 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); - DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); - DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); + DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 3); + DECLARE_REG(u32, hyp_va_bits, host_ctxt, 4); /* * __pkvm_init() will return only if an error occurred, otherwise it * will tail-call in __pkvm_init_finalise() which will have to deal * with the host context directly. */ - cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, - hyp_va_bits); + cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, per_cpu_base, hyp_va_bits); } static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) @@ -543,6 +543,18 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } +static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm(); +} + +static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + __pkvm_unreserve_vm(handle); +} + static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -563,11 +575,115 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); } -static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_vcpu_in_poison_fault(struct kvm_cpu_context *host_ctxt) +{ + int ret; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + ret = hyp_vcpu ? __pkvm_vcpu_in_poison_fault(hyp_vcpu) : -EINVAL; + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_force_reclaim_guest_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_force_reclaim_page_guest(phys); +} + +static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, gfn); +} + +static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle); +} + +static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle); +} + +static void handle___tracing_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1); + DECLARE_REG(size_t, desc_size, host_ctxt, 2); - cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); + cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size); +} + +static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt) +{ + __tracing_unload(); +} + +static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(bool, enable, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_enable(enable); +} + +static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu); +} + +static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u32, mult, host_ctxt, 1); + DECLARE_REG(u32, shift, host_ctxt, 2); + DECLARE_REG(u64, epoch_ns, host_ctxt, 3); + DECLARE_REG(u64, epoch_cyc, host_ctxt, 4); + + __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc); +} + +static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_reset(cpu); +} + +static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned short, id, host_ctxt, 1); + DECLARE_REG(bool, enable, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable); +} + +static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, id, host_ctxt, 1); + + trace_selftest(id); +} + +static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v5_save_apr(kern_hyp_va(cpu_if)); +} + +static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if)); } typedef void (*hcall_t)(struct kvm_cpu_context *); @@ -584,14 +700,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_host_unshare_hyp), - HANDLE_FUNC(__pkvm_host_share_guest), - HANDLE_FUNC(__pkvm_host_unshare_guest), - HANDLE_FUNC(__pkvm_host_relax_perms_guest), - HANDLE_FUNC(__pkvm_host_wrprotect_guest), - HANDLE_FUNC(__pkvm_host_test_clear_young_guest), - HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), @@ -601,11 +709,37 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__tracing_load), + HANDLE_FUNC(__tracing_unload), + HANDLE_FUNC(__tracing_enable), + HANDLE_FUNC(__tracing_swap_reader), + HANDLE_FUNC(__tracing_update_clock), + HANDLE_FUNC(__tracing_reset), + HANDLE_FUNC(__tracing_enable_event), + HANDLE_FUNC(__tracing_write_event), + HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__vgic_v5_save_apr), + HANDLE_FUNC(__vgic_v5_restore_vmcr_apr), + + HANDLE_FUNC(__pkvm_host_share_hyp), + HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_donate_guest), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), + HANDLE_FUNC(__pkvm_reserve_vm), + HANDLE_FUNC(__pkvm_unreserve_vm), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), - HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_in_poison_fault), + HANDLE_FUNC(__pkvm_force_reclaim_guest_page), + HANDLE_FUNC(__pkvm_reclaim_dying_guest_page), + HANDLE_FUNC(__pkvm_start_teardown_vm), + HANDLE_FUNC(__pkvm_finalize_teardown_vm), HANDLE_FUNC(__pkvm_vcpu_load), HANDLE_FUNC(__pkvm_vcpu_put), HANDLE_FUNC(__pkvm_tlb_flush_vmid), @@ -614,9 +748,11 @@ static const hcall_t host_hcall[] = { static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); - unsigned long hcall_min = 0; + unsigned long hcall_min = 0, hcall_max = __KVM_HOST_SMCCC_FUNC_MAX; hcall_t hfn; + BUILD_BUG_ON(ARRAY_SIZE(host_hcall) != __KVM_HOST_SMCCC_FUNC_MAX); + /* * If pKVM has been initialised then reject any calls to the * early "privileged" hypercalls. Note that we cannot reject @@ -626,13 +762,16 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) * basis. This is all fine, however, since __pkvm_prot_finalize * returns -EPERM after the first call for a given CPU. */ - if (static_branch_unlikely(&kvm_protected_mode_initialized)) - hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + if (static_branch_unlikely(&kvm_protected_mode_initialized)) { + hcall_min = __KVM_HOST_SMCCC_FUNC_MIN_PKVM; + } else { + hcall_max = __KVM_HOST_SMCCC_FUNC_PKVM_ONLY; + } id &= ~ARM_SMCCC_CALL_HINTS; id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id >= hcall_max)) goto inval; hfn = host_hcall[id]; @@ -649,15 +788,27 @@ inval: static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) { + trace_hyp_exit(host_ctxt, HYP_REASON_SMC); __kvm_hyp_host_forward_smc(host_ctxt); + trace_hyp_enter(host_ctxt, HYP_REASON_SMC); } static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, func_id, host_ctxt, 0); + u64 esr = read_sysreg_el2(SYS_ESR); bool handled; + if (esr & ESR_ELx_xVC_IMM_MASK) { + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; + goto exit_skip_instr; + } + func_id &= ~ARM_SMCCC_CALL_HINTS; + if (upper_32_bits(func_id)) { + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; + goto exit_skip_instr; + } handled = kvm_host_psci_handler(host_ctxt, func_id); if (!handled) @@ -665,26 +816,109 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) if (!handled) default_host_smc_handler(host_ctxt); +exit_skip_instr: /* SMC was trapped, move ELR past the current PC. */ kvm_skip_host_instr(); } +void inject_host_exception(u64 esr) +{ + u64 sctlr, spsr_el1, spsr_el2, exc_offset = except_type_sync; + const u64 spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | + PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT; + + spsr_el1 = spsr_el2 = read_sysreg_el2(SYS_SPSR); + switch (spsr_el1 & (PSR_MODE_MASK | PSR_MODE32_BIT)) { + case PSR_MODE_EL0t: + exc_offset += LOWER_EL_AArch64_VECTOR; + break; + case PSR_MODE_EL0t | PSR_MODE32_BIT: + exc_offset += LOWER_EL_AArch32_VECTOR; + break; + default: + exc_offset += CURRENT_EL_SP_ELx_VECTOR; + } + + spsr_el2 &= spsr_mask; + spsr_el2 |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | + PSR_MODE_EL1h; + + sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(sctlr & SCTLR_EL1_SPAN)) + spsr_el2 |= PSR_PAN_BIT; + + if (sctlr & SCTLR_ELx_DSSBS) + spsr_el2 |= PSR_SSBS_BIT; + + if (system_supports_mte()) + spsr_el2 |= PSR_TCO_BIT; + + if (esr_fsc_is_translation_fault(esr)) + write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el1(spsr_el1, SYS_SPSR); + write_sysreg_el2(read_sysreg_el1(SYS_VBAR) + exc_offset, SYS_ELR); + write_sysreg_el2(spsr_el2, SYS_SPSR); +} + +static void inject_host_undef64(void) +{ + inject_host_exception((ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | + ESR_ELx_IL); +} + +static bool handle_host_mte(u64 esr) +{ + switch (esr_sys64_to_sysreg(esr)) { + case SYS_RGSR_EL1: + case SYS_GCR_EL1: + case SYS_TFSR_EL1: + case SYS_TFSRE0_EL1: + /* If we're here for any reason other than MTE, it's a bug. */ + if (read_sysreg(HCR_EL2) & HCR_ATA) + return false; + break; + case SYS_GMID_EL1: + /* If we're here for any reason other than MTE, it's a bug. */ + if (!(read_sysreg(HCR_EL2) & HCR_TID5)) + return false; + break; + default: + return false; + } + + inject_host_undef64(); + return true; +} + void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); + switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_HVC64: + trace_hyp_enter(host_ctxt, HYP_REASON_HVC); handle_host_hcall(host_ctxt); break; case ESR_ELx_EC_SMC64: + trace_hyp_enter(host_ctxt, HYP_REASON_SMC); handle_host_smc(host_ctxt); break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: + trace_hyp_enter(host_ctxt, HYP_REASON_HOST_ABORT); handle_host_mem_abort(host_ctxt); break; + case ESR_ELx_EC_SYS64: + if (handle_host_mte(esr)) + break; + fallthrough; default: BUG(); } + + trace_hyp_exit(host_ctxt, HYP_REASON_ERET_HOST); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..7a02837203d1 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -16,6 +16,12 @@ SECTIONS { HYP_SECTION(.text) HYP_SECTION(.data..ro_after_init) HYP_SECTION(.rodata) +#ifdef CONFIG_NVHE_EL2_TRACING + . = ALIGN(PAGE_SIZE); + BEGIN_HYP_SECTION(.event_ids) + *(SORT(.hyp.event_ids.*)) + END_HYP_SECTION +#endif /* * .hyp..data..percpu needs to be page aligned to maintain the same @@ -25,5 +31,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c index 46a2d4f2b3c6..baa6260f88dc 100644 --- a/arch/arm64/kvm/hyp/nvhe/list_debug.c +++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c @@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v) bool corruption = unlikely(condition); \ if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ - BUG_ON(1); \ + BUG(); \ } else \ WARN_ON(1); \ } \ diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 19c3c631708c..25f04629014e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -5,6 +5,7 @@ */ #include <linux/kvm_host.h> + #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> @@ -14,12 +15,14 @@ #include <hyp/fault.h> +#include <nvhe/arm-smccc.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> +#include <nvhe/trap_handler.h> -#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) +#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP) struct host_mmu host_mmu; @@ -28,6 +31,19 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static void pkvm_sme_dvmsync_fw_call(void) +{ + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) { + struct arm_smccc_res res; + + /* + * Ignore the return value. Probing for the workaround + * availability took place in init_hyp_mode(). + */ + hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + } +} + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -60,6 +76,11 @@ static void hyp_unlock_component(void) hyp_spin_unlock(&pkvm_pgd_lock); } +#define for_each_hyp_page(__p, __st, __sz) \ + for (struct hyp_page *__p = hyp_phys_to_page(__st), \ + *__e = __p + ((__sz) >> PAGE_SHIFT); \ + __p < __e; __p++) + static void *host_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); @@ -161,12 +182,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } -static bool guest_stage2_force_pte_cb(u64 addr, u64 end, - enum kvm_pgtable_prot prot) -{ - return true; -} - static void *guest_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); @@ -217,16 +232,42 @@ static void guest_s2_put_page(void *addr) hyp_put_page(¤t_vm->pool, addr); } +static void __apply_guest_page(void *va, size_t size, + void (*func)(void *addr, size_t size)) +{ + size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE); + va = PTR_ALIGN_DOWN(va, PAGE_SIZE); + size = PAGE_ALIGN(size); + + while (size) { + size_t map_size = PAGE_SIZE; + void *map; + + if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE) + map = hyp_fixblock_map(__hyp_pa(va), &map_size); + else + map = hyp_fixmap_map(__hyp_pa(va)); + + func(map, map_size); + + if (map_size == PMD_SIZE) + hyp_fixblock_unmap(); + else + hyp_fixmap_unmap(); + + size -= map_size; + va += map_size; + } +} + static void clean_dcache_guest_page(void *va, size_t size) { - __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __clean_dcache_guest_page); } static void invalidate_icache_guest_page(void *va, size_t size) { - __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __invalidate_icache_guest_page); } int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) @@ -255,8 +296,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) }; guest_lock_component(vm); - ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, - guest_stage2_force_pte_cb); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL); guest_unlock_component(vm); if (ret) return ret; @@ -266,7 +306,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { struct hyp_page *page; void *addr; @@ -300,6 +340,8 @@ int __pkvm_prot_finalize(void) params->vttbr = kvm_get_vttbr(mmu); params->vtcr = mmu->vtcr; params->hcr_el2 |= HCR_VM; + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + params->hcr_el2 |= HCR_FWB; /* * The CMO below not only cleans the updated params to the @@ -309,7 +351,7 @@ int __pkvm_prot_finalize(void) */ kvm_flush_dcache_to_poc(params, sizeof(*params)); - write_sysreg(params->hcr_el2, hcr_el2); + write_sysreg_hcr(params->hcr_el2); __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* @@ -343,6 +385,19 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +/* + * Ensure the PFN range is contained within PA-range. + * + * This check is also robust to overflows and is therefore a requirement before + * using a pfn/nr_pages pair from an untrusted source. + */ +static bool pfn_range_is_valid(u64 pfn, u64 nr_pages) +{ + u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT); + + return pfn < limit && ((limit - pfn) >= nr_pages); +} + struct kvm_mem_range { u64 start; u64 end; @@ -422,8 +477,15 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { + /* + * We don't make permission changes to the host idmap after + * initialisation, so we can squash -EAGAIN to save callers + * having to treat it like success in the case that they try to + * map something that is already mapped. + */ return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, - prot, &host_s2_pool, 0); + prot, &host_s2_pool, + KVM_PGTABLE_WALK_IGNORE_EAGAIN); } /* @@ -455,6 +517,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; + u64 granule; s8 level; int ret; @@ -464,25 +527,29 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return ret; if (kvm_pte_valid(pte)) - return -EAGAIN; + return -EEXIST; if (pte) { - WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE); + WARN_ON(addr_is_memory(addr) && + get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE); return -EPERM; } - do { - u64 granule = kvm_granule_size(level); + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) { + if (!kvm_level_supports_block_mapping(level)) + continue; + granule = kvm_granule_size(level); cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; - level++; - } while ((level <= KVM_PGTABLE_LAST_LEVEL) && - !(kvm_level_supports_block_mapping(level) && - range_included(&cur, range))); + if (!range_included(&cur, range) && level < KVM_PGTABLE_LAST_LEVEL) + continue; + *range = cur; + return 0; + } - *range = cur; + WARN_ON(1); - return 0; + return -EINVAL; } int host_stage2_idmap_locked(phys_addr_t addr, u64 size, @@ -493,30 +560,109 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) { - phys_addr_t end = addr + size; - - for (; addr < end; addr += PAGE_SIZE) - hyp_phys_to_page(addr)->host_state = state; + for_each_hyp_page(page, addr, size) + set_host_state(page, state); } -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1) +#define KVM_HOST_DONATION_PTE_EXTRA_MASK GENMASK(59, 4) +static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, + u8 owner_id, u64 meta) { + kvm_pte_t annotation; int ret; - if (!addr_is_memory(addr)) + if (owner_id == PKVM_ID_HOST) + return -EINVAL; + + if (!range_is_memory(addr, addr + size)) return -EPERM; - ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); - if (ret) - return ret; + if (!FIELD_FIT(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id)) + return -EINVAL; - /* Don't forget to update the vmemmap tracking for the host */ - if (owner_id == PKVM_ID_HOST) - __host_update_page_state(addr, size, PKVM_PAGE_OWNED); - else + if (!FIELD_FIT(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta)) + return -EINVAL; + + annotation = FIELD_PREP(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id) | + FIELD_PREP(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta); + ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, + addr, size, &host_s2_pool, + KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); + if (!ret) { + /* + * After stage2 maintenance has happened, but before the page + * owner has changed. + */ + pkvm_sme_dvmsync_fw_call(); __host_update_page_state(addr, size, PKVM_NOPAGE); + } + + return ret; +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + int ret = -EINVAL; + + switch (owner_id) { + case PKVM_ID_HOST: + if (!range_is_memory(addr, addr + size)) + return -EPERM; + + ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); + if (!ret) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + break; + case PKVM_ID_HYP: + ret = host_stage2_set_owner_metadata_locked(addr, size, + owner_id, 0); + break; + } + + return ret; +} + +#define KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK GENMASK(15, 0) +/* We need 40 bits for the GFN to cover a 52-bit IPA with 4k pages and LPA2 */ +#define KVM_HOST_PTE_OWNER_GUEST_GFN_MASK GENMASK(55, 16) +static u64 host_stage2_encode_gfn_meta(struct pkvm_hyp_vm *vm, u64 gfn) +{ + pkvm_handle_t handle = vm->kvm.arch.pkvm.handle; + + BUILD_BUG_ON((pkvm_handle_t)-1 > KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK); + WARN_ON(!FIELD_FIT(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn)); + + return FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, handle) | + FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn); +} + +static int host_stage2_decode_gfn_meta(kvm_pte_t pte, struct pkvm_hyp_vm **vm, + u64 *gfn) +{ + pkvm_handle_t handle; + u64 meta; + + if (WARN_ON(kvm_pte_valid(pte))) + return -EINVAL; + + if (FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) != + KVM_HOST_INVALID_PTE_TYPE_DONATION) { + return -EINVAL; + } + + if (FIELD_GET(KVM_HOST_DONATION_PTE_OWNER_MASK, pte) != PKVM_ID_GUEST) + return -EPERM; + meta = FIELD_GET(KVM_HOST_DONATION_PTE_EXTRA_MASK, pte); + handle = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, meta); + *vm = get_vm_by_handle(handle); + if (!*vm) { + /* We probably raced with teardown; try again */ + return -EAGAIN; + } + + *gfn = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, meta); return 0; } @@ -563,11 +709,43 @@ unlock: return ret; } +static void host_inject_mem_abort(struct kvm_cpu_context *host_ctxt) +{ + u64 ec, esr, spsr; + + esr = read_sysreg_el2(SYS_ESR); + spsr = read_sysreg_el2(SYS_SPSR); + + /* Repaint the ESR to report a same-level fault if taken from EL1 */ + if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) { + ec = ESR_ELx_EC(esr); + if (ec == ESR_ELx_EC_DABT_LOW) + ec = ESR_ELx_EC_DABT_CUR; + else if (ec == ESR_ELx_EC_IABT_LOW) + ec = ESR_ELx_EC_IABT_CUR; + else + WARN_ON(1); + esr &= ~ESR_ELx_EC_MASK; + esr |= ec << ESR_ELx_EC_SHIFT; + } + + /* + * Since S1PTW should only ever be set for stage-2 faults, we're pretty + * much guaranteed that it won't be set in ESR_EL1 by the hardware. So, + * let's use that bit to allow the host abort handler to differentiate + * this abort from normal userspace faults. + * + * Note: although S1PTW is RES0 at EL1, it is guaranteed by the + * architecture to be backed by flops, so it should be safe to use. + */ + esr |= ESR_ELx_S1PTW; + inject_host_exception(esr); +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; u64 esr, addr; - int ret = 0; esr = read_sysreg_el2(SYS_ESR); if (!__get_fault_info(esr, &fault)) { @@ -578,9 +756,24 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) return; } - addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; - ret = host_stage2_idmap(addr); - BUG_ON(ret && ret != -EAGAIN); + + /* + * Yikes, we couldn't resolve the fault IPA. This should reinject an + * abort into the host when we figure out how to do that. + */ + BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); + addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; + + switch (host_stage2_idmap(addr)) { + case -EPERM: + host_inject_mem_abort(host_ctxt); + fallthrough; + case -EEXIST: + case 0: + break; + default: + BUG(); + } } struct check_walk_data { @@ -611,16 +804,16 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - u64 end = addr + size; int ret; - ret = check_range_allowed_memory(addr, end); + ret = check_range_allowed_memory(addr, addr + size); if (ret) return ret; hyp_assert_lock_held(&host_mmu.lock); - for (; addr < end; addr += PAGE_SIZE) { - if (hyp_phys_to_page(addr)->host_state != state) + + for_each_hyp_page(page, addr, size) { + if (get_host_state(page) != state) return -EPERM; } @@ -630,7 +823,7 @@ static int __host_check_page_state_range(u64 addr, u64 size, static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) { + if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) { int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); if (ret) @@ -642,38 +835,45 @@ static int __host_set_page_state_range(u64 addr, u64 size, return 0; } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) +static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - if (!kvm_pte_valid(pte)) - return PKVM_NOPAGE; + for_each_hyp_page(page, phys, size) + set_hyp_state(page, state); +} - return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); +static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, phys, size) { + if (get_hyp_state(page) != state) + return -EPERM; + } + + return 0; } -static int __hyp_check_page_state_range(u64 addr, u64 size, - enum pkvm_page_state state) +static bool guest_pte_is_poisoned(kvm_pte_t pte) { - struct check_walk_data d = { - .desired = state, - .get_page_state = hyp_get_page_state, - }; + if (kvm_pte_valid(pte)) + return false; - hyp_assert_lock_held(&pkvm_pgd_lock); - return check_page_state_range(&pkvm_pgtable, addr, size, &d); + return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) == + KVM_GUEST_INVALID_PTE_TYPE_POISONED; } static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { + if (guest_pte_is_poisoned(pte)) + return PKVM_POISON; + if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, +static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, u64 size, enum pkvm_page_state state) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); struct check_walk_data d = { .desired = state, .get_page_state = guest_get_page_state, @@ -683,11 +883,80 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, return check_page_state_range(&vm->pgt, addr, size, &d); } +static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep, u64 *physp) +{ + kvm_pte_t pte; + u64 phys; + s8 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return ret; + if (guest_pte_is_poisoned(pte)) + return -EHWPOISON; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; + + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (WARN_ON(ret)) + return ret; + + *ptep = pte; + *physp = phys; + + return 0; +} + +int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + s8 level; + u64 ipa; + int ret; + + switch (kvm_vcpu_trap_get_class(&hyp_vcpu->vcpu)) { + case ESR_ELx_EC_DABT_LOW: + case ESR_ELx_EC_IABT_LOW: + if (kvm_vcpu_trap_is_translation_fault(&hyp_vcpu->vcpu)) + break; + fallthrough; + default: + return -EINVAL; + } + + /* + * The host has the faulting IPA when it calls us from the guest + * fault handler but we retrieve it ourselves from the FAR so as + * to avoid exposing an "oracle" that could reveal data access + * patterns of the guest after initial donation of its pages. + */ + ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu); + ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu)); + + guest_lock_component(vm); + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + goto unlock; + + if (level != KVM_PGTABLE_LAST_LEVEL) { + ret = -EINVAL; + goto unlock; + } + + ret = guest_pte_is_poisoned(pte); +unlock: + guest_unlock_component(vm); + return ret; +} + int __pkvm_host_share_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); - void *virt = __hyp_va(phys); - enum kvm_pgtable_prot prot; u64 size = PAGE_SIZE; int ret; @@ -697,14 +966,11 @@ int __pkvm_host_share_hyp(u64 pfn) ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); unlock: @@ -714,6 +980,72 @@ unlock: return ret; } +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys, ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); + if (ret) + goto unlock; + + ret = -EPERM; + if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_OWNED) + goto unlock; + if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE)) + goto unlock; + + ret = 0; + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)); +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 meta, phys, ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); + if (ret) + goto unlock; + + ret = -EPERM; + if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_SHARED_OWNED) + goto unlock; + if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)) + goto unlock; + + ret = 0; + meta = host_stage2_encode_gfn_meta(vm, gfn); + WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, + PKVM_ID_GUEST, meta)); + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + int __pkvm_host_unshare_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); @@ -727,7 +1059,7 @@ int __pkvm_host_unshare_hyp(u64 pfn) ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (ret) goto unlock; - ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); if (ret) goto unlock; if (hyp_page_count((void *)virt)) { @@ -735,7 +1067,7 @@ int __pkvm_host_unshare_hyp(u64 pfn) goto unlock; } - WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); unlock: @@ -750,23 +1082,23 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) u64 phys = hyp_pfn_to_phys(pfn); u64 size = PAGE_SIZE * nr_pages; void *virt = __hyp_va(phys); - enum kvm_pgtable_prot prot; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); - WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP)); WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); unlock: @@ -783,18 +1115,20 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) u64 virt = (u64)__hyp_va(phys); int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); - ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); @@ -809,24 +1143,30 @@ int hyp_pin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); u64 end = PAGE_ALIGN((u64)to); + u64 phys = __hyp_pa(start); u64 size = end - start; + struct hyp_page *p; int ret; host_lock_component(); hyp_lock_component(); - ret = __host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (ret) goto unlock; - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); if (ret) goto unlock; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + hyp_page_ref_inc(p); + if (p->refcount == 1) + WARN_ON(pkvm_create_mappings_locked((void *)cur, + (void *)cur + PAGE_SIZE, + PAGE_HYP)); + } unlock: hyp_unlock_component(); @@ -839,12 +1179,17 @@ void hyp_unpin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); u64 end = PAGE_ALIGN((u64)to); + struct hyp_page *p; host_lock_component(); hyp_lock_component(); - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + if (p->refcount == 1) + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE); + hyp_page_ref_dec(p); + } hyp_unlock_component(); host_unlock_component(); @@ -856,6 +1201,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (!ret) @@ -871,6 +1219,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (!ret) @@ -880,49 +1231,281 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) return ret; } -int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, - enum kvm_pgtable_prot prot) +static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); - u64 phys = hyp_pfn_to_phys(pfn); - u64 ipa = hyp_pfn_to_phys(gfn); - struct hyp_page *page; - int ret; + size_t block_size; - if (prot & ~KVM_PGTABLE_PROT_RWX) + if (nr_pages == 1) { + *size = PAGE_SIZE; + return 0; + } + + /* We solely support second to last level huge mapping */ + block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1); + + if (nr_pages != block_size >> PAGE_SHIFT) return -EINVAL; - ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (!IS_ALIGNED(phys | ipa, block_size)) + return -EINVAL; + + *size = block_size; + return 0; +} + +static void hyp_poison_page(phys_addr_t phys) +{ + void *addr = hyp_fixmap_map(phys); + + memset(addr, 0, PAGE_SIZE); + /* + * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page() + * here as the latter may elide the CMO under the assumption that FWB + * will be enabled on CPUs that support it. This is incorrect for the + * host stage-2 and would otherwise lead to a malicious host potentially + * being able to read the contents of newly reclaimed guest pages. + */ + kvm_flush_dcache_to_poc(addr, PAGE_SIZE); + hyp_fixmap_unmap(); +} + +static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm, + u64 *gfn) +{ + enum pkvm_page_state state; + kvm_pte_t pte; + s8 level; + int ret; + + if (!addr_is_memory(phys)) + return -EFAULT; + + state = get_host_state(hyp_phys_to_page(phys)); + switch (state) { + case PKVM_PAGE_OWNED: + case PKVM_PAGE_SHARED_OWNED: + case PKVM_PAGE_SHARED_BORROWED: + /* The access should no longer fault; try again. */ + return -EAGAIN; + case PKVM_NOPAGE: + break; + default: + return -EPERM; + } + + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level); if (ret) return ret; + if (WARN_ON(level != KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + return host_stage2_decode_gfn_meta(pte, vm, gfn); +} + +int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys) +{ + struct pkvm_hyp_vm *vm; + u64 gfn, ipa, pa; + kvm_pte_t pte; + int ret; + + phys &= PAGE_MASK; + + hyp_spin_lock(&vm_table_lock); host_lock_component(); + + ret = host_stage2_get_guest_info(phys, &vm, &gfn); + if (ret) + goto unlock_host; + + ipa = hyp_pfn_to_phys(gfn); guest_lock_component(vm); + ret = get_valid_guest_pte(vm, ipa, &pte, &pa); + if (ret) + goto unlock_guest; - ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE); + WARN_ON(pa != phys); + if (guest_get_page_state(pte, ipa) != PKVM_PAGE_OWNED) { + ret = -EPERM; + goto unlock_guest; + } + + /* We really shouldn't be allocating, so don't pass a memcache */ + ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, NULL, + KVM_GUEST_INVALID_PTE_TYPE_POISONED, + 0); + if (ret) + goto unlock_guest; + + hyp_poison_page(phys); + WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST)); +unlock_guest: + guest_unlock_component(vm); +unlock_host: + host_unlock_component(); + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); if (ret) goto unlock; - page = hyp_phys_to_page(phys); - switch (page->host_state) { + switch (guest_get_page_state(pte, ipa)) { case PKVM_PAGE_OWNED: - WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE)); + hyp_poison_page(phys); break; case PKVM_PAGE_SHARED_OWNED: - if (page->host_share_guest_count) - break; - /* Only host to np-guest multi-sharing is tolerated */ - WARN_ON(1); - fallthrough; + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)); + break; default: ret = -EPERM; goto unlock; } + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE)); + WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + /* + * -EHWPOISON implies that the page was forcefully reclaimed already + * so return success for the GUP pin to be dropped. + */ + return ret && ret != -EHWPOISON ? ret : 0; +} + +/* + * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one + * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages() + * bounds the worst-case allocation: exact for the PAGE_SIZE leaf, + * conservative by one for the block. + */ +static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu)) + return -ENOMEM; + + return 0; +} + +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 meta; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + + ret = __guest_check_page_state_range(vm, ipa, PAGE_SIZE, PKVM_NOPAGE); + if (ret) + goto unlock; + + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + + meta = host_stage2_encode_gfn_meta(vm, gfn); + WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, + PKVM_ID_GUEST, meta)); WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size; + int ret; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); + if (ret) + return ret; + + ret = check_range_allowed_memory(phys, phys + size); + if (ret) + return ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE); + if (ret) + goto unlock; + + for_each_hyp_page(page, phys, size) { + switch (get_host_state(page)) { + case PKVM_PAGE_OWNED: + continue; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count == U32_MAX) { + ret = -EBUSY; + goto unlock; + } + + /* Only host to np-guest multi-sharing is tolerated */ + if (page->host_share_guest_count) + continue; + + fallthrough; + default: + ret = -EPERM; + goto unlock; + } + } + + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + + for_each_hyp_page(page, phys, size) { + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + page->host_share_guest_count++; + } + + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys, pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), &vcpu->vcpu.arch.pkvm_memcache, 0)); - page->host_share_guest_count++; unlock: guest_unlock_component(vm); @@ -931,10 +1514,9 @@ unlock: return ret; } -static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size) { enum pkvm_page_state state; - struct hyp_page *page; kvm_pte_t pte; u64 phys; s8 level; @@ -945,51 +1527,60 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip return ret; if (!kvm_pte_valid(pte)) return -ENOENT; - if (level != KVM_PGTABLE_LAST_LEVEL) + if (size && kvm_granule_size(level) != size) return -E2BIG; + if (!size) + size = kvm_granule_size(level); + state = guest_get_page_state(pte, ipa); if (state != PKVM_PAGE_SHARED_BORROWED) return -EPERM; phys = kvm_pte_to_phys(pte); - ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + ret = check_range_allowed_memory(phys, phys + size); if (WARN_ON(ret)) return ret; - page = hyp_phys_to_page(phys); - if (page->host_state != PKVM_PAGE_SHARED_OWNED) - return -EPERM; - if (WARN_ON(!page->host_share_guest_count)) - return -EINVAL; + for_each_hyp_page(page, phys, size) { + if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; + } *__phys = phys; return 0; } -int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { u64 ipa = hyp_pfn_to_phys(gfn); - struct hyp_page *page; - u64 phys; + u64 size, phys; int ret; + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + host_lock_component(); guest_lock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); + ret = __check_host_shared_guest(vm, &phys, ipa, size); if (ret) goto unlock; - ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size); if (ret) goto unlock; - page = hyp_phys_to_page(phys); - page->host_share_guest_count--; - if (!page->host_share_guest_count) - WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED)); + for_each_hyp_page(page, phys, size) { + /* __check_host_shared_guest() protects against underflow */ + page->host_share_guest_count--; + if (!page->host_share_guest_count) + set_host_state(page, PKVM_PAGE_OWNED); + } unlock: guest_unlock_component(vm); @@ -998,7 +1589,7 @@ unlock: return ret; } -static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size) { u64 phys; int ret; @@ -1009,7 +1600,7 @@ static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) host_lock_component(); guest_lock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); + ret = __check_host_shared_guest(vm, &phys, ipa, size); guest_unlock_component(vm); host_unlock_component(); @@ -1029,7 +1620,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; - assert_host_shared_guest(vm, ipa); + assert_host_shared_guest(vm, ipa, 0); guest_lock_component(vm); ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); @@ -1037,33 +1628,41 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ return ret; } -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { - u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); guest_lock_component(vm); - ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size); guest_unlock_component(vm); return ret; } -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm) { - u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); guest_lock_component(vm); - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold); guest_unlock_component(vm); return ret; @@ -1077,10 +1676,241 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + assert_host_shared_guest(vm, ipa, 0); guest_lock_component(vm); kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); return 0; } + +#ifdef CONFIG_NVHE_EL2_DEBUG +struct pkvm_expected_state { + enum pkvm_page_state host; + enum pkvm_page_state hyp; + enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */ +}; + +static struct pkvm_expected_state selftest_state; +static struct hyp_page *selftest_page; +static struct pkvm_hyp_vcpu *selftest_vcpu; + +static u64 selftest_ipa(void) +{ + return BIT(selftest_vcpu->vcpu.arch.hw_mmu->pgt->ia_bits - 1); +} + +static void assert_page_state(void) +{ + void *virt = hyp_page_to_virt(selftest_page); + u64 size = PAGE_SIZE << selftest_page->order; + struct pkvm_hyp_vcpu *vcpu = selftest_vcpu; + u64 phys = hyp_virt_to_phys(virt); + u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; + struct pkvm_hyp_vm *vm; + + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + host_lock_component(); + WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host)); + host_unlock_component(); + + hyp_lock_component(); + WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); + hyp_unlock_component(); + + guest_lock_component(vm); + WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0])); + WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); + guest_unlock_component(vm); +} + +#define assert_transition_res(res, fn, ...) \ + do { \ + WARN_ON(fn(__VA_ARGS__) != res); \ + assert_page_state(); \ + } while (0) + +void pkvm_ownership_selftest(void *base) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; + void *virt = hyp_alloc_pages(&host_s2_pool, 0); + struct pkvm_hyp_vcpu *vcpu; + u64 phys, size, pfn, gfn; + struct pkvm_hyp_vm *vm; + + WARN_ON(!virt); + selftest_page = hyp_virt_to_page(virt); + selftest_page->refcount = 0; + selftest_vcpu = vcpu = init_selftest_vm(base); + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + size = PAGE_SIZE << selftest_page->order; + phys = hyp_virt_to_phys(virt); + pfn = hyp_phys_to_pfn(phys); + gfn = hyp_phys_to_pfn(selftest_ipa()); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE; + assert_page_state(); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + hyp_unpin_shared_mem(virt, virt + size); + WARN_ON(hyp_page_count(virt) != 1); + assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + + hyp_unpin_shared_mem(virt, virt + size); + assert_page_state(); + WARN_ON(hyp_page_count(virt)); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_hyp, pfn); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2); + + selftest_state.guest[0] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.guest[1] = PKVM_NOPAGE; + selftest_state.host = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[0] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_BORROWED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_OWNED; + assert_transition_res(0, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[0] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_guest_unshare_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_guest_unshare_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.guest[0] = PKVM_POISON; + assert_transition_res(0, __pkvm_host_force_reclaim_page_guest, phys); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EHWPOISON, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EHWPOISON, __pkvm_guest_unshare_host, vcpu, gfn); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[1] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.guest[1] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_reclaim_page_guest, gfn + 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + + teardown_selftest_vm(); + selftest_page->refcount = 1; + hyp_put_page(&host_s2_pool, virt); +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index f41c7440b34b..3b0bee496bff 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -229,9 +229,8 @@ int hyp_map_vectors(void) return 0; } -void *hyp_fixmap_map(phys_addr_t phys) +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) { - struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); kvm_pte_t pte, *ptep = slot->ptep; pte = *ptep; @@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys) return (void *)slot->addr; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys) + offset_in_page(phys); +} + static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) { kvm_pte_t *ptep = slot->ptep; u64 addr = slot->addr; + u32 level; + + if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE) + level = KVM_PGTABLE_LAST_LEVEL; + else + level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); @@ -260,8 +270,8 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); - dsb(ish); + __tlbi_level(vale2is, addr, level); + __tlbi_sync_s1ish_hyp(); isb(); } @@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void) static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); + struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg; - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) + if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level)) return -EINVAL; slot->addr = ctx->addr; @@ -296,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu) struct kvm_pgtable_walker walker = { .cb = __create_fixmap_slot_cb, .flags = KVM_PGTABLE_WALK_LEAF, - .arg = (void *)cpu, + .arg = per_cpu_ptr(&fixmap_slots, cpu), }; return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); } -int hyp_create_pcpu_fixmap(void) +#if PAGE_SHIFT < 16 +#define HAS_FIXBLOCK +static struct hyp_fixmap_slot hyp_fixblock_slot; +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock); +#endif + +static int create_fixblock(void) +{ +#ifdef HAS_FIXBLOCK + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &hyp_fixblock_slot, + }; + unsigned long addr; + phys_addr_t phys; + int ret, i; + + /* Find a RAM phys address, PMD aligned */ + for (i = 0; i < hyp_memblock_nr; i++) { + phys = ALIGN(hyp_memory[i].base, PMD_SIZE); + if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size)) + break; + } + + if (i >= hyp_memblock_nr) + return -EINVAL; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = ALIGN(__io_map_base, PMD_SIZE); + ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE); + if (ret) + goto unlock; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP); + if (ret) + goto unlock; + + ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +#else + return 0; +#endif +} + +void *hyp_fixblock_map(phys_addr_t phys, size_t *size) +{ +#ifdef HAS_FIXBLOCK + *size = PMD_SIZE; + hyp_spin_lock(&hyp_fixblock_lock); + return fixmap_map_slot(&hyp_fixblock_slot, phys) + offset_in_page(phys); +#else + *size = PAGE_SIZE; + return hyp_fixmap_map(phys); +#endif +} + +void hyp_fixblock_unmap(void) +{ +#ifdef HAS_FIXBLOCK + fixmap_clear_slot(&hyp_fixblock_slot); + hyp_spin_unlock(&hyp_fixblock_lock); +#else + hyp_fixmap_unmap(); +#endif +} + +int hyp_create_fixmap(void) { unsigned long addr, i; int ret; @@ -322,7 +403,7 @@ int hyp_create_pcpu_fixmap(void) return ret; } - return 0; + return create_fixblock(); } int hyp_create_idmap(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3927fe52a3dd..eb1c10120f9f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -4,6 +4,8 @@ * Author: Fuad Tabba <tabba@google.com> */ +#include <kvm/arm_hypercalls.h> + #include <linux/kvm_host.h> #include <linux/mm.h> @@ -23,8 +25,8 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; /* - * The currently loaded hyp vCPU for each physical CPU. Used only when - * protected KVM is enabled, but for both protected and non-protected VMs. + * The currently loaded hyp vCPU for each physical CPU. Used in protected mode + * for both protected and non-protected VMs. */ static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); @@ -46,7 +48,8 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_FWB; if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) vcpu->arch.hcr_el2 |= HCR_TID4; else vcpu->arch.hcr_el2 |= HCR_TID2; @@ -81,7 +84,7 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val &= ~(HCR_AMVOFFEN); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + if (!kvm_has_mte(kvm)) { val |= HCR_TID5; val &= ~(HCR_DCT | HCR_ATA); } @@ -116,8 +119,8 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) val |= MDCR_EL2_TTRF; - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) - val |= MDCR_EL2_E2TB_MASK; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + val &= ~MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) @@ -134,7 +137,7 @@ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - /* Protected KVM does not support AArch32 guests. */ + /* No AArch32 support for protected guests. */ if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) return -EINVAL; @@ -166,8 +169,13 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) pkvm_vcpu_reset_hcr(vcpu); - if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. */ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; return 0; + } ret = pkvm_check_pvm_cpu_features(vcpu); if (ret) @@ -175,6 +183,7 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) pvm_init_traps_hcr(vcpu); pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); return 0; } @@ -185,6 +194,11 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) */ #define HANDLE_OFFSET 0x1000 +/* + * Marks a reserved but not yet used entry in the VM table. + */ +#define RESERVED_ENTRY ((void *)0xa110ca7ed) + static unsigned int vm_handle_to_idx(pkvm_handle_t handle) { return handle - HANDLE_OFFSET; @@ -203,13 +217,14 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) DEFINE_HYP_SPINLOCK(vm_table_lock); /* - * The table of VM entries for protected VMs in hyp. - * Allocated at hyp initialization and setup. + * A table that tracks all VMs in protected mode. + * Allocated during hyp initialization and setup. */ static struct pkvm_hyp_vm **vm_table; void pkvm_hyp_vm_table_init(void *tbl) { + BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1); WARN_ON(vm_table); vm_table = tbl; } @@ -217,13 +232,19 @@ void pkvm_hyp_vm_table_init(void *tbl) /* * Return the hyp vm structure corresponding to the handle. */ -static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) { unsigned int idx = vm_handle_to_idx(handle); + hyp_assert_lock_held(&vm_table_lock); + if (unlikely(idx >= KVM_MAX_PVMS)) return NULL; + /* A reserved entry doesn't represent an initialized VM. */ + if (unlikely(vm_table[idx] == RESERVED_ENTRY)) + return NULL; + return vm_table[idx]; } @@ -239,10 +260,16 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) + goto unlock; + + if (hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; - hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + /* Pairs with smp_store_release() in register_hyp_vcpu(). */ + hyp_vcpu = smp_load_acquire(&hyp_vm->vcpus[vcpu_idx]); + if (!hyp_vcpu) + goto unlock; /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { @@ -315,33 +342,44 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); - if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) - set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + /* CTR_EL0 is always under host control, even for protected VMs. */ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { hyp_vm->kvm.arch.flags = host_arch_flags; + hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED); bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + return; } + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_MTE)) + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED); + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_SVE)) { set_bit(KVM_ARM_VCPU_SVE, allowed_features); kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); } @@ -356,74 +394,157 @@ static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); } +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) + return; + + sve_state = hyp_vcpu->vcpu.arch.sve_state; + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unsigned int nr_vcpus) { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, - unsigned int nr_vcpus) + unsigned int nr_vcpus, pkvm_handle_t handle) { + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx = vm_handle_to_idx(handle); + + hyp_vm->kvm.arch.pkvm.handle = handle; + hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; - hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; - hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected); + hyp_vm->kvm.arch.pkvm.is_created = true; hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->vtcr = host_mmu.arch.mmu.vtcr; + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; } -static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) { struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned int sve_max_vl; + size_t sve_state_size; + void *sve_state; + int ret = 0; - if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; + } + + /* Limit guest vector length to the maximum supported by the host. */ + sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); + sve_state_size = sve_state_size_from_vl(sve_max_vl); + sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); + + if (!sve_state || !sve_state_size) { + ret = -EINVAL; + goto err; + } + + ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); + if (ret) + goto err; + + vcpu->arch.sve_state = sve_state; + vcpu->arch.sve_max_vl = sve_max_vl; + + return 0; +err: + clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); + return ret; +} + +static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + const struct kvm *host_kvm = hyp_vm->host_kvm; + struct kvm *kvm = &hyp_vm->kvm; + + if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags)) + return -EINVAL; + + if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return 0; + + memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs)); + + return 0; +} + +static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + int ret = 0; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + else + ret = vm_copy_id_regs(hyp_vcpu); + + return ret; } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, - struct kvm_vcpu *host_vcpu, - unsigned int vcpu_idx) + struct kvm_vcpu *host_vcpu) { int ret = 0; if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) return -EBUSY; - if (host_vcpu->vcpu_idx != vcpu_idx) { - ret = -EINVAL; - goto done; - } - hyp_vcpu->host_vcpu = host_vcpu; hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); - hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; - if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) - kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + ret = pkvm_vcpu_init_sysregs(hyp_vcpu); + if (ret) + goto done; ret = pkvm_vcpu_init_traps(hyp_vcpu); if (ret) goto done; - pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); return ret; } -static int find_free_vm_table_entry(struct kvm *host_kvm) +static int find_free_vm_table_entry(void) { int i; @@ -436,15 +557,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm) } /* - * Allocate a VM table entry and insert a pointer to the new vm. + * Reserve a VM table entry. * - * Return a unique handle to the protected VM on success, + * Return a unique handle to the VM on success, * negative error code on failure. */ -static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, - struct pkvm_hyp_vm *hyp_vm) +static int allocate_vm_table_entry(void) { - struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; int idx; hyp_assert_lock_held(&vm_table_lock); @@ -457,20 +576,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (unlikely(!vm_table)) return -EINVAL; - idx = find_free_vm_table_entry(host_kvm); - if (idx < 0) + idx = find_free_vm_table_entry(); + if (unlikely(idx < 0)) return idx; - hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); + vm_table[idx] = RESERVED_ENTRY; - /* VMID 0 is reserved for the host */ - atomic64_set(&mmu->vmid.id, idx + 1); + return idx; +} - mmu->arch = &hyp_vm->kvm.arch; - mmu->pgt = &hyp_vm->pgt; +static int __insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + unsigned int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = vm_handle_to_idx(handle); + if (unlikely(idx >= KVM_MAX_PVMS)) + return -EINVAL; + + if (unlikely(vm_table[idx] != RESERVED_ENTRY)) + return -EINVAL; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm.handle; + + return 0; +} + +/* + * Insert a pointer to the initialized VM into the VM table. + * + * Return 0 on success, or negative error code on failure. + */ +static int insert_vm_table_entry(pkvm_handle_t handle, + struct pkvm_hyp_vm *hyp_vm) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = __insert_vm_table_entry(handle, hyp_vm); + hyp_spin_unlock(&vm_table_lock); + + return ret; } /* @@ -537,10 +693,108 @@ static void unmap_donated_memory_noclear(void *va, size_t size) } /* - * Initialize the hypervisor copy of the protected VM state using the - * memory donated by the host. + * Reserves an entry in the hypervisor for a new VM in protected mode. + * + * Return a unique handle to the VM on success, negative error code on failure. + */ +int __pkvm_reserve_vm(void) +{ + int ret; + + hyp_spin_lock(&vm_table_lock); + ret = allocate_vm_table_entry(); + hyp_spin_unlock(&vm_table_lock); + + if (ret < 0) + return ret; + + return idx_to_vm_handle(ret); +} + +/* + * Removes a reserved entry, but only if is hasn't been used yet. + * Otherwise, the VM needs to be destroyed. + */ +void __pkvm_unreserve_vm(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(!vm_table)) + return; + + hyp_spin_lock(&vm_table_lock); + if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY)) + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, + }, + }, + }, +}; + +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; + +struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + unsigned long min_pages, seeded = 0; + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + /* + * Mirror pkvm_refill_memcache() for the share/donate pre-checks; + * the selftest invokes those functions directly and would + * otherwise see an empty memcache. + */ + min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + if (seeded < min_pages) { + push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, + hyp_page_to_virt(&p[i]), hyp_virt_to_phys); + seeded++; + } else { + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } + } + + selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); + insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm); + return &selftest_vcpu; +} + +void teardown_selftest_vm(void) +{ + hyp_spin_lock(&vm_table_lock); + remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle); + hyp_spin_unlock(&vm_table_lock); +} +#endif /* CONFIG_NVHE_EL2_DEBUG */ + +/* + * Initialize the hypervisor copy of the VM state using host-donated memory. * - * Unmaps the donated memory from the host at stage 2. + * Unmap the donated memory from the host at stage 2. * * host_kvm: A pointer to the host's struct kvm. * vm_hva: The host va of the area being donated for the VM state. @@ -549,8 +803,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * the VM. Must be page aligned. Its size is implied by the VM's * VTCR. * - * Return a unique handle to the protected VM on success, - * negative error code on failure. + * Return 0 success, negative error code on failure. */ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva) @@ -558,6 +811,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, struct pkvm_hyp_vm *hyp_vm = NULL; size_t vm_size, pgd_size; unsigned int nr_vcpus; + pkvm_handle_t handle; void *pgd = NULL; int ret; @@ -571,6 +825,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_unpin_kvm; } + handle = READ_ONCE(host_kvm->arch.pkvm.handle); + if (unlikely(handle < HANDLE_OFFSET)) { + ret = -EINVAL; + goto err_unpin_kvm; + } + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr); @@ -584,24 +844,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, if (!pgd) goto err_remove_mappings; - init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); - - hyp_spin_lock(&vm_table_lock); - ret = insert_vm_table_entry(host_kvm, hyp_vm); - if (ret < 0) - goto err_unlock; + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle); ret = kvm_guest_prepare_stage2(hyp_vm, pgd); if (ret) - goto err_remove_vm_table_entry; - hyp_spin_unlock(&vm_table_lock); + goto err_remove_mappings; - return hyp_vm->kvm.arch.pkvm.handle; + /* Must be called last since this publishes the VM. */ + ret = insert_vm_table_entry(handle, hyp_vm); + if (ret) + goto err_remove_mappings; + + return 0; -err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); -err_unlock: - hyp_spin_unlock(&vm_table_lock); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); unmap_donated_memory(pgd, pgd_size); @@ -611,22 +866,39 @@ err_unpin_kvm: } /* - * Initialize the hypervisor copy of the protected vCPU state using the - * memory donated by the host. + * Initialize the hypervisor copy of the vCPU state using host-donated memory. * - * handle: The handle for the protected vm. + * handle: The hypervisor handle for the vm. * host_vcpu: A pointer to the corresponding host vcpu. * vcpu_hva: The host va of the area being donated for the vcpu state. * Must be page aligned. The size of the area must be equal to * the page-aligned size of 'struct pkvm_hyp_vcpu'. * Return 0 on success, negative error code on failure. */ +static int register_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm, + struct pkvm_hyp_vcpu *hyp_vcpu) +{ + unsigned int idx = hyp_vcpu->vcpu.vcpu_idx; + + if (idx >= hyp_vm->kvm.created_vcpus) + return -EINVAL; + + if (hyp_vm->vcpus[idx]) + return -EINVAL; + + /* + * Ensure the hyp_vcpu is initialised before publishing it to + * the vCPU-load path via 'hyp_vm->vcpus[]'. + */ + smp_store_release(&hyp_vm->vcpus[idx], hyp_vcpu); + return 0; +} + int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva) { struct pkvm_hyp_vcpu *hyp_vcpu; struct pkvm_hyp_vm *hyp_vm; - unsigned int idx; int ret; hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); @@ -641,27 +913,21 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, goto unlock; } - idx = hyp_vm->nr_vcpus; - if (idx >= hyp_vm->kvm.created_vcpus) { - ret = -EINVAL; - goto unlock; - } - - ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); if (ret) goto unlock; - hyp_vm->vcpus[idx] = hyp_vcpu; - hyp_vm->nr_vcpus++; + ret = register_hyp_vcpu(hyp_vm, hyp_vcpu); + if (ret) { + unpin_host_vcpu(host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } unlock: hyp_spin_unlock(&vm_table_lock); - if (ret) { + if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); - return ret; - } - - return 0; + return ret; } static void @@ -676,9 +942,56 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) unmap_donated_memory_noclear(addr, size); } -int __pkvm_teardown_vm(pkvm_handle_t handle) +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + int ret = -EINVAL; + + if (!hyp_vm) + return ret; + + if (hyp_vm->kvm.arch.pkvm.is_dying) + ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm); + + put_pkvm_hyp_vm(hyp_vm); + return ret; +} + +static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_assert_lock_held(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_page_count(hyp_vm)) + return NULL; + + return hyp_vm; +} + +int __pkvm_start_teardown_vm(pkvm_handle_t handle) { - struct kvm_hyp_memcache *mc; + struct pkvm_hyp_vm *hyp_vm; + int ret = 0; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); + if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) { + ret = -EINVAL; + goto unlock; + } + + hyp_vm->kvm.arch.pkvm.is_dying = true; +unlock: + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle) +{ + struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -686,14 +999,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) int err; hyp_spin_lock(&vm_table_lock); - hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm) { - err = -ENOENT; - goto err_unlock; - } - - if (WARN_ON(hyp_page_count(hyp_vm))) { - err = -EBUSY; + hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); + if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) { + err = -EINVAL; goto err_unlock; } @@ -706,18 +1014,24 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Reclaim guest pages (including page-table pages) */ mc = &host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); - unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); /* Push the metadata pages to the teardown memcache */ - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; - struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; while (vcpu_mc->nr_pages) { void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); - push_hyp_memcache(mc, addr, hyp_virt_to_phys); + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); unmap_donated_memory_noclear(addr, PAGE_SIZE); } @@ -733,3 +1047,121 @@ err_unlock: hyp_spin_unlock(&vm_table_lock); return err; } + +static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa) +{ + u64 elr; + + /* Fake up a data abort (level 3 translation fault on write) */ + vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) | + ESR_ELx_WNR | ESR_ELx_FSC_FAULT | + FIELD_PREP(ESR_ELx_FSC_LEVEL, 3); + + /* Shuffle the IPA around into the HPFAR */ + vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK; + + /* This is a virtual address. 0's good. Let's go with 0. */ + vcpu->arch.fault.far_el2 = 0; + + /* Rewind the ELR so we return to the HVC once the IPA is mapped */ + elr = read_sysreg(elr_el2); + elr -= 4; + write_sysreg(elr, elr_el2); + + return ARM_EXCEPTION_TRAP; +} + +static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + u64 ipa = smccc_get_arg1(vcpu); + + if (!PAGE_ALIGNED(ipa)) + goto out_guest; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) { + case 0: + ret[0] = SMCCC_RET_SUCCESS; + goto out_guest; + case -ENOENT: + /* + * Convert the exception into a data abort so that the page + * being shared is mapped into the guest next time. + */ + *exit_code = __pkvm_memshare_page_req(vcpu, ipa); + goto out_host; + } + +out_guest: + return true; +out_host: + return false; +} + +static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + u64 ipa = smccc_get_arg1(vcpu); + + if (!PAGE_ALIGNED(ipa)) + return; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) + ret[0] = SMCCC_RET_SUCCESS; +} + +/* + * Handler for protected VM HVC calls. + * + * Returns true if the hypervisor has handled the exit (and control + * should return to the guest) or false if it hasn't (and the handling + * should be performed by the host). + */ +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 val[4] = { SMCCC_RET_INVALID_PARAMETER }; + bool handled = true; + + switch (smccc_get_function(vcpu)) { + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + if (smccc_get_arg1(vcpu) || + smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + val[0] = PAGE_SIZE; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + if (smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + handled = pkvm_memshare_call(val, vcpu, exit_code); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + if (smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + pkvm_memunshare_call(val, vcpu); + break; + default: + /* Punt everything else back to the host, for now. */ + handled = false; + } + + if (handled) + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); + return handled; +} diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index c3e196fb8b18..e20db999e328 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -6,11 +6,12 @@ #include <asm/kvm_asm.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_hypevents.h> #include <asm/kvm_mmu.h> -#include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <uapi/linux/psci.h> +#include <nvhe/arm-smccc.h> #include <nvhe/memory.h> #include <nvhe/trap_handler.h> @@ -65,7 +66,7 @@ static unsigned long psci_call(unsigned long fn, unsigned long arg0, { struct arm_smccc_res res; - arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); + hyp_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); return res.a0; } @@ -200,30 +201,42 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) __hyp_pa(init_params), 0); } -asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) +static void __noreturn __kvm_host_psci_cpu_entry(unsigned long pc, unsigned long r0) { - struct psci_boot_args *boot_args; - struct kvm_cpu_context *host_ctxt; - - host_ctxt = host_data_ptr(host_ctxt); + struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt); - if (is_cpu_on) - boot_args = this_cpu_ptr(&cpu_on_args); - else - boot_args = this_cpu_ptr(&suspend_args); + trace_hyp_enter(host_ctxt, HYP_REASON_PSCI); - cpu_reg(host_ctxt, 0) = boot_args->r0; - write_sysreg_el2(boot_args->pc, SYS_ELR); - - if (is_cpu_on) - release_boot_args(boot_args); + cpu_reg(host_ctxt, 0) = r0; + write_sysreg_el2(pc, SYS_ELR); write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + trace_hyp_exit(host_ctxt, HYP_REASON_PSCI); __host_enter(host_ctxt); } +asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void) +{ + struct psci_boot_args *boot_args = this_cpu_ptr(&cpu_on_args); + unsigned long pc, r0; + + pc = READ_ONCE(boot_args->pc); + r0 = READ_ONCE(boot_args->r0); + + release_boot_args(boot_args); + + __kvm_host_psci_cpu_entry(pc, r0); +} + +asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void) +{ + struct psci_boot_args *boot_args = this_cpu_ptr(&suspend_args); + + __kvm_host_psci_cpu_entry(boot_args->pc, boot_args->r0); +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index d62bcb5634a2..d461981616d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -28,6 +28,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *selftest_base; static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -38,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size) hyp_early_alloc_init(virt, size); + nr_pages = pkvm_selftest_pages(); + selftest_base = hyp_early_alloc_contig(nr_pages); + if (nr_pages && !selftest_base) + return -ENOMEM; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) @@ -119,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -180,7 +190,9 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { enum pkvm_page_state state; + struct hyp_page *page; phys_addr_t phys; + enum kvm_pgtable_prot prot; if (!kvm_pte_valid(ctx->old)) return 0; @@ -192,19 +204,32 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!addr_is_memory(phys)) return -EINVAL; + page = hyp_phys_to_page(phys); + /* * Adjust the host stage-2 mappings to match the ownership attributes - * configured in the hypervisor stage-1. + * configured in the hypervisor stage-1, and make sure to propagate them + * to the hyp_vmemmap state. */ - state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); + prot = kvm_pgtable_hyp_pte_prot(ctx->old); + state = pkvm_getstate(prot); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + set_hyp_state(page, PKVM_PAGE_OWNED); + /* hyp text is RO in the host stage-2 to be inspected on panic. */ + if (prot == PAGE_HYP_EXEC) { + set_host_state(page, PKVM_NOPAGE); + return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R); + } else { + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); + } case PKVM_PAGE_SHARED_OWNED: - hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED; + set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); + set_host_state(page, PKVM_PAGE_SHARED_BORROWED); break; case PKVM_PAGE_SHARED_BORROWED: - hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED; + set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED); + set_host_state(page, PKVM_PAGE_SHARED_OWNED); break; default: return -EINVAL; @@ -287,15 +312,15 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; - ret = fix_host_ownership(); + ret = fix_hyp_pgtable_refcnt(); if (ret) goto out; - ret = fix_hyp_pgtable_refcnt(); + ret = hyp_create_fixmap(); if (ret) goto out; - ret = hyp_create_pcpu_fixmap(); + ret = fix_host_ownership(); if (ret) goto out; @@ -304,6 +329,8 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); + + pkvm_ownership_selftest(selftest_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, @@ -314,8 +341,7 @@ out: __host_enter(host_ctxt); } -int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits) +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); @@ -328,7 +354,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, return -EINVAL; hyp_spin_lock_init(&pkvm_pgd_lock); - hyp_nr_cpus = nr_cpus; ret = divide_memory_pool(virt, size); if (ret) diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index 5b6eeab1a774..7c832d60d22b 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -34,7 +34,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) stacktrace_info->pc = pc; } -#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#ifdef CONFIG_PKVM_STACKTRACE #include <asm/stacktrace/nvhe.h> DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); @@ -134,11 +134,11 @@ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) unwind(&state, pkvm_save_backtrace_entry, &idx); } -#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +#else /* !CONFIG_PKVM_STACKTRACE */ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) { } -#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ +#endif /* CONFIG_PKVM_STACKTRACE */ /* * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 7d2ba6ef0261..8d1df3d33595 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -7,7 +7,6 @@ #include <hyp/switch.h> #include <hyp/sysreg-sr.h> -#include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -21,6 +20,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_hypevents.h> #include <asm/kvm_mmu.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> @@ -33,70 +33,30 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); -extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); - -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); - - if (has_hvhe()) { - val |= CPACR_EL1_TTA; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } - - write_sysreg(val, cpacr_el1); - } else { - val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - - /* - * Always trap SME since it's not supported in KVM. - * TSM is RES1 if SME isn't implemented. - */ - val |= CPTR_EL2_TSM; - - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - - if (!guest_owns_fp_regs()) - val |= CPTR_EL2_TFP; +struct fgt_masks hfgrtr_masks; +struct fgt_masks hfgwtr_masks; +struct fgt_masks hfgitr_masks; +struct fgt_masks hdfgrtr_masks; +struct fgt_masks hdfgwtr_masks; +struct fgt_masks hafgrtr_masks; +struct fgt_masks hfgrtr2_masks; +struct fgt_masks hfgwtr2_masks; +struct fgt_masks hfgitr2_masks; +struct fgt_masks hdfgrtr2_masks; +struct fgt_masks hdfgwtr2_masks; +struct fgt_masks ich_hfgrtr_masks; +struct fgt_masks ich_hfgwtr_masks; +struct fgt_masks ich_hfgitr_masks; - write_sysreg(val, cptr_el2); - } -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - if (has_hvhe()) { - u64 val = CPACR_EL1_FPEN; - - if (cpus_have_final_cap(ARM64_SVE)) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - - write_sysreg(val, cpacr_el1); - } else { - u64 val = CPTR_NVHE_EL2_RES1; - - if (!cpus_have_final_cap(ARM64_SVE)) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - - write_sysreg(val, cptr_el2); - } -} +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); + + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + __activate_traps_common(vcpu); __activate_cptr_traps(vcpu); @@ -140,9 +100,11 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); + __deactivate_traps_common(vcpu); - write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); @@ -151,6 +113,12 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) /* Save VGICv3 state on non-VHE systems */ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) { + if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) { + __vgic_v5_save_state(&vcpu->arch.vgic_cpu.vgic_v5); + __vgic_v5_save_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5); + return; + } + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); @@ -160,6 +128,12 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) /* Restore VGICv3 state on non-VHE systems */ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { + if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) { + __vgic_v5_restore_state(&vcpu->arch.vgic_cpu.vgic_v5); + __vgic_v5_restore_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5); + return; + } + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); @@ -231,6 +205,7 @@ static const exit_handler_fn hyp_exit_handlers[] = { static const exit_handler_fn pvm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64, [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, @@ -252,7 +227,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - synchronize_vcpu_pstate(vcpu, exit_code); + synchronize_vcpu_pstate(vcpu); /* * Some guests (e.g., protected VMs) are not be allowed to run in @@ -319,7 +294,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) * We're about to restore some new MMU state. Make sure * ongoing page-table walks that have started before we * trapped to EL2 have completed. This also synchronises the - * above disabling of SPE and TRBE. + * above disabling of BRBE. * * See DDI0487I.a D8.1.5 "Out-of-context translation regimes", * rule R_LFHQG and subsequent information statements. @@ -349,10 +324,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_switch_to_guest(vcpu); do { + trace_hyp_exit(host_ctxt, HYP_REASON_ERET_GUEST); + /* Jump in the fire! */ exit_code = __guest_enter(vcpu); /* And we're baaack! */ + trace_hyp_enter(host_ctxt, HYP_REASON_GUEST_EXIT); } while (fixup_guest_exit(vcpu, &exit_code)); __sysreg_save_state_nvhe(guest_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 1ddd9ed3cbb3..8c3fbb413a06 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -20,6 +20,7 @@ */ u64 id_aa64pfr0_el1_sys_val; u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64pfr2_el1_sys_val; u64 id_aa64isar0_el1_sys_val; u64 id_aa64isar1_el1_sys_val; u64 id_aa64isar2_el1_sys_val; @@ -108,6 +109,11 @@ static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { FEAT_END }; +static const struct pvm_ftr_bits pvmid_aa64pfr2[] = { + MAX_FEAT(ID_AA64PFR2_EL1, GCIE, NI), + FEAT_END +}; + static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), @@ -134,7 +140,7 @@ static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), - MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, IDS, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), @@ -221,6 +227,8 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); + case SYS_ID_AA64PFR2_EL1: + return get_restricted_features(vcpu, id_aa64pfr2_el1_sys_val, pvmid_aa64pfr2); case SYS_ID_AA64ISAR0_EL1: return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: @@ -243,17 +251,17 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) } } -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) +static void inject_sync64(struct kvm_vcpu *vcpu, u64 esr) { - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + /* + * Make sure we have the latest update to VBAR_EL1, as pKVM + * handles traps very early, before sysregs are resync'ed + */ + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); __kvm_adjust_pc(vcpu); @@ -264,6 +272,15 @@ static void inject_undef64(struct kvm_vcpu *vcpu) write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + inject_sync64(vcpu, (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT)); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { @@ -338,6 +355,18 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, return true; } +static bool pvm_idst_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, IDS, IMP)) + inject_sync64(vcpu, kvm_vcpu_get_esr(vcpu)); + else + inject_undef64(vcpu); + + return false; +} + /* Mark the specified system register as an AArch32 feature id register. */ #define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } @@ -371,6 +400,17 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Cache maintenance by set/way operations are restricted. */ /* Debug and Trace Registers are restricted. */ + RAZ_WI(SYS_DBGBVRn_EL1(0)), + RAZ_WI(SYS_DBGBCRn_EL1(0)), + RAZ_WI(SYS_DBGWVRn_EL1(0)), + RAZ_WI(SYS_DBGWCRn_EL1(0)), + RAZ_WI(SYS_MDSCR_EL1), + RAZ_WI(SYS_OSLAR_EL1), + RAZ_WI(SYS_OSLSR_EL1), + RAZ_WI(SYS_OSDLR_EL1), + + /* Group 1 ID registers */ + HOST_HANDLED(SYS_REVIDR_EL1), /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ @@ -407,7 +447,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* CRm=4 */ AARCH64(SYS_ID_AA64PFR0_EL1), AARCH64(SYS_ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + AARCH64(SYS_ID_AA64PFR2_EL1), ID_UNALLOCATED(4,3), AARCH64(SYS_ID_AA64ZFR0_EL1), ID_UNALLOCATED(4,5), @@ -440,6 +480,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Scalable Vector Registers are restricted. */ + HOST_HANDLED(SYS_ICC_PMR_EL1), + RAZ_WI(SYS_ERRIDR_EL1), RAZ_WI(SYS_ERRSELR_EL1), RAZ_WI(SYS_ERXFR_EL1), @@ -453,13 +495,20 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Limited Ordering Regions Registers are restricted. */ + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), HOST_HANDLED(SYS_ICC_SGI1R_EL1), HOST_HANDLED(SYS_ICC_ASGI1R_EL1), HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, HOST_HANDLED(SYS_CCSIDR_EL1), HOST_HANDLED(SYS_CLIDR_EL1), + { SYS_DESC(SYS_CCSIDR2_EL1), .access = pvm_idst_access }, + { SYS_DESC(SYS_GMID_EL1), .access = pvm_idst_access }, + { SYS_DESC(SYS_SMIDR_EL1), .access = pvm_idst_access }, + HOST_HANDLED(SYS_AIDR_EL1), HOST_HANDLED(SYS_CSSELR_EL1), HOST_HANDLED(SYS_CTR_EL0), diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index dba101565de3..3cc613cce5f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 48da9ca9763f..b29140995d48 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -169,7 +168,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* @@ -226,7 +224,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -240,7 +238,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -266,5 +264,5 @@ void __kvm_flush_vm_context(void) /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c new file mode 100644 index 000000000000..e7e150ab265f --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/trace.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <nvhe/clock.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> +#include <nvhe/trace.h> + +#include <asm/percpu.h> +#include <asm/kvm_mmu.h> +#include <asm/local.h> + +#include "simple_ring_buffer.c" + +static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs); + +static struct hyp_trace_buffer { + struct simple_rb_per_cpu __percpu *simple_rbs; + void *bpages_backing_start; + size_t bpages_backing_size; + hyp_spinlock_t lock; +} trace_buffer = { + .simple_rbs = &__simple_rbs, + .lock = __HYP_SPIN_LOCK_UNLOCKED, +}; + +static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer) +{ + return trace_buffer->bpages_backing_size > 0; +} + +void *tracing_reserve_entry(unsigned long length) +{ + return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length, + trace_clock()); +} + +void tracing_commit_entry(void) +{ + simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs)); +} + +static int __admit_host_mem(void *start, u64 size) +{ + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size) || !size) + return -EINVAL; + + if (!is_protected_kvm_enabled()) + return 0; + + return __pkvm_host_donate_hyp(hyp_virt_to_pfn(start), size >> PAGE_SHIFT); +} + +static void __release_host_mem(void *start, u64 size) +{ + if (!is_protected_kvm_enabled()) + return; + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT)); +} + +static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer, + struct hyp_trace_desc *desc) +{ + void *start = (void *)kern_hyp_va(desc->bpages_backing_start); + size_t size = desc->bpages_backing_size; + int ret; + + ret = __admit_host_mem(start, size); + if (ret) + return ret; + + memset(start, 0, size); + + trace_buffer->bpages_backing_start = start; + trace_buffer->bpages_backing_size = size; + + return 0; +} + +static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer) +{ + void *start = trace_buffer->bpages_backing_start; + size_t size = trace_buffer->bpages_backing_size; + + if (!size) + return; + + memset(start, 0, size); + + __release_host_mem(start, size); + + trace_buffer->bpages_backing_start = 0; + trace_buffer->bpages_backing_size = 0; +} + +static void *__pin_shared_page(unsigned long kern_va) +{ + void *va = kern_hyp_va((void *)kern_va); + + if (!is_protected_kvm_enabled()) + return va; + + return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va; +} + +static void __unpin_shared_page(void *va) +{ + if (!is_protected_kvm_enabled()) + return; + + hyp_unpin_shared_mem(va, va + PAGE_SIZE); +} + +static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer) +{ + int cpu; + + hyp_assert_lock_held(&trace_buffer->lock); + + if (!hyp_trace_buffer_loaded(trace_buffer)) + return; + + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) + simple_ring_buffer_unload_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu), + __unpin_shared_page); + + hyp_trace_buffer_unload_bpage_backing(trace_buffer); +} + +static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer, + struct hyp_trace_desc *desc) +{ + struct simple_buffer_page *bpages; + struct ring_buffer_desc *rb_desc; + int ret, cpu; + + hyp_assert_lock_held(&trace_buffer->lock); + + if (hyp_trace_buffer_loaded(trace_buffer)) + return -EINVAL; + + ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc); + if (ret) + return ret; + + bpages = trace_buffer->bpages_backing_start; + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) { + ret = simple_ring_buffer_init_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu), + bpages, rb_desc, __pin_shared_page, + __unpin_shared_page); + if (ret) + break; + + bpages += rb_desc->nr_page_va; + } + + if (ret) + hyp_trace_buffer_unload(trace_buffer); + + return ret; +} + +static bool hyp_trace_desc_is_valid(struct hyp_trace_desc *desc, size_t desc_size) +{ + struct ring_buffer_desc *rb_desc; + unsigned int cpu; + size_t nr_bpages; + void *desc_end; + + if (!is_protected_kvm_enabled()) + return true; + + /* + * Both desc_size and bpages_backing_size are untrusted host-provided + * values. We rely on __pkvm_host_donate_hyp() to enforce their validity. + */ + desc_end = (void *)desc + desc_size; + nr_bpages = desc->bpages_backing_size / sizeof(struct simple_buffer_page); + + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) { + /* Can we read nr_page_va? */ + if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end) + return false; + + /* Overflow desc? */ + if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end) + return false; + + /* Overflow bpages backing memory? */ + if (nr_bpages < rb_desc->nr_page_va) + return false; + + if (cpu >= hyp_nr_cpus) + return false; + + if (cpu != rb_desc->cpu) + return false; + + nr_bpages -= rb_desc->nr_page_va; + } + + return true; +} + +int __tracing_load(unsigned long desc_hva, size_t desc_size) +{ + struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva); + int ret; + + ret = __admit_host_mem(desc, desc_size); + if (ret) + return ret; + + if (!hyp_trace_desc_is_valid(desc, desc_size)) { + ret = -EINVAL; + goto err_release_desc; + } + + hyp_spin_lock(&trace_buffer.lock); + + ret = hyp_trace_buffer_load(&trace_buffer, desc); + + hyp_spin_unlock(&trace_buffer.lock); + +err_release_desc: + __release_host_mem(desc, desc_size); + return ret; +} + +void __tracing_unload(void) +{ + hyp_spin_lock(&trace_buffer.lock); + hyp_trace_buffer_unload(&trace_buffer); + hyp_spin_unlock(&trace_buffer.lock); +} + +int __tracing_enable(bool enable) +{ + int cpu, ret = enable ? -EINVAL : 0; + + hyp_spin_lock(&trace_buffer.lock); + + if (!hyp_trace_buffer_loaded(&trace_buffer)) + goto unlock; + + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) + simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu), + enable); + + ret = 0; + +unlock: + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} + +int __tracing_swap_reader(unsigned int cpu) +{ + int ret = -ENODEV; + + if (cpu >= hyp_nr_cpus) + return -EINVAL; + + hyp_spin_lock(&trace_buffer.lock); + + if (hyp_trace_buffer_loaded(&trace_buffer)) + ret = simple_ring_buffer_swap_reader_page( + per_cpu_ptr(trace_buffer.simple_rbs, cpu)); + + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} + +void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) +{ + int cpu; + + /* After this loop, all CPUs are observing the new bank... */ + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) { + struct simple_rb_per_cpu *simple_rb = per_cpu_ptr(trace_buffer.simple_rbs, cpu); + + while (READ_ONCE(simple_rb->status) == SIMPLE_RB_WRITING) + ; + } + + /* ...we can now override the old one and swap. */ + trace_clock_update(mult, shift, epoch_ns, epoch_cyc); +} + +int __tracing_reset(unsigned int cpu) +{ + int ret = -ENODEV; + + if (cpu >= hyp_nr_cpus) + return -EINVAL; + + hyp_spin_lock(&trace_buffer.lock); + + if (hyp_trace_buffer_loaded(&trace_buffer)) + ret = simple_ring_buffer_reset(per_cpu_ptr(trace_buffer.simple_rbs, cpu)); + + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index df5cc74a7dd0..0c1defa5fb0f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,12 +11,6 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> - -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -120,11 +114,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) return pte; } -static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) -{ - return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); -} - static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) @@ -150,7 +139,7 @@ static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, * page table walk. */ if (r == -EAGAIN) - return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); + return walker->flags & KVM_PGTABLE_WALK_IGNORE_EAGAIN; return !r; } @@ -348,6 +337,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (!(prot & KVM_PGTABLE_PROT_R)) return -EINVAL; + if (!cpus_have_final_cap(ARM64_KVM_HVHE)) + prot &= ~KVM_PGTABLE_PROT_UX; + if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; @@ -357,8 +349,16 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (system_supports_bti_kernel()) attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; + } + + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + if (!(prot & KVM_PGTABLE_PROT_PX)) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_PXN; + if (!(prot & KVM_PGTABLE_PROT_UX)) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_UXN; } else { - attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; + if (!(prot & KVM_PGTABLE_PROT_PX)) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); @@ -379,8 +379,15 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) if (!kvm_pte_valid(pte)) return prot; - if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) - prot |= KVM_PGTABLE_PROT_X; + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_PXN)) + prot |= KVM_PGTABLE_PROT_PX; + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_UXN)) + prot |= KVM_PGTABLE_PROT_UX; + } else { + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_PX; + } ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) @@ -478,18 +485,18 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); + __tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vale2is, ctx->addr, ctx->level); *unmapped += granule; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); mm_ops->put_page(ctx->ptep); @@ -569,7 +576,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct stage2_map_data { const u64 phys; kvm_pte_t attr; - u8 owner_id; + kvm_pte_t pte_annot; kvm_pte_t *anchor; kvm_pte_t *childp; @@ -589,8 +596,8 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) u64 vtcr = VTCR_EL2_FLAGS; s8 lvls; - vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; - vtcr |= VTCR_EL2_T0SZ(phys_shift); + vtcr |= FIELD_PREP(VTCR_EL2_PS, kvm_get_parange(mmfr0)); + vtcr |= FIELD_PREP(VTCR_EL2_T0SZ, (UL(64) - phys_shift)); /* * Use a minimum 2 level page table to prevent splitting * host PMD huge pages at stage2. @@ -630,21 +637,11 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) vtcr |= VTCR_EL2_DS; /* Set the vmid bits */ - vtcr |= (get_vmid_bits(mmfr1) == 16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; + vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS : 0; return vtcr; } -static bool stage2_has_fwb(struct kvm_pgtable *pgt) -{ - if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) - return false; - - return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); -} - void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size) { @@ -665,13 +662,49 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, } } -#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) +#define KVM_S2_MEMATTR(pgt, attr) \ + ({ \ + kvm_pte_t __attr; \ + \ + if ((pgt)->flags & KVM_PGTABLE_S2_AS_S1) \ + __attr = PAGE_S2_MEMATTR(AS_S1); \ + else \ + __attr = PAGE_S2_MEMATTR(attr); \ + \ + __attr; \ + }) + +static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr) +{ + bool px, ux; + u8 xn; + + px = prot & KVM_PGTABLE_PROT_PX; + ux = prot & KVM_PGTABLE_PROT_UX; + + if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux) + return -EINVAL; + + if (px && ux) + xn = 0b00; + else if (!px && ux) + xn = 0b01; + else if (!px && !ux) + xn = 0b10; + else + xn = 0b11; + + *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN; + *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn); + return 0; +} static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + int r; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { @@ -691,8 +724,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr = KVM_S2_MEMATTR(pgt, NORMAL); } - if (!(prot & KVM_PGTABLE_PROT_X)) - attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + r = stage2_set_xn_attr(prot, &attr); + if (r) + return r; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -721,8 +755,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; - if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) - prot |= KVM_PGTABLE_PROT_X; + + switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) { + case 0b00: + prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX; + break; + case 0b01: + prot |= KVM_PGTABLE_PROT_UX; + break; + case 0b11: + prot |= KVM_PGTABLE_PROT_PX; + break; + default: + break; + } return prot; } @@ -747,7 +793,11 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) static bool stage2_pte_is_locked(kvm_pte_t pte) { - return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); + if (kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) == + KVM_INVALID_PTE_TYPE_LOCKED; } static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) @@ -778,6 +828,7 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t locked_pte; if (stage2_pte_is_locked(ctx->old)) { /* @@ -788,7 +839,9 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, return false; } - if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) + locked_pte = FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, + KVM_INVALID_PTE_TYPE_LOCKED); + if (!stage2_try_set_pte(ctx, locked_pte)) return false; if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { @@ -835,7 +888,7 @@ static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) * system supporting FWB as the optimization is entirely * pointless when the unmap walker needs to perform CMOs. */ - return system_supports_tlb_range() && stage2_has_fwb(pgt); + return system_supports_tlb_range() && cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); } static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, @@ -913,7 +966,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else - new = kvm_init_invalid_leaf_owner(data->owner_id); + new = data->pte_annot; /* * Skip updating the PTE if we are trying to recreate the exact @@ -1067,16 +1120,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id) +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, enum kvm_invalid_pte_type type, + kvm_pte_t pte_annot) { int ret; struct stage2_map_data map_data = { .mmu = pgt->mmu, .memcache = mc, - .owner_id = owner_id, .force_pte = true, .annotation = true, + .pte_annot = pte_annot | + FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, type), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1085,7 +1140,10 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; - if (owner_id > KVM_MAX_OWNER_ID) + if (pte_annot & ~KVM_INVALID_PTE_ANNOT_MASK) + return -EINVAL; + + if (!type || type == KVM_INVALID_PTE_TYPE_LOCKED) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); @@ -1115,7 +1173,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, if (mm_ops->page_count(childp) != 1) return 0; } else if (stage2_pte_cacheable(pgt, ctx->old)) { - need_flush = !stage2_has_fwb(pgt); + need_flush = !cpus_have_final_cap(ARM64_HAS_STAGE2_FWB); } /* @@ -1229,7 +1287,8 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, - NULL, NULL, 0); + NULL, NULL, + KVM_PGTABLE_WALK_IGNORE_EAGAIN); } void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, @@ -1296,9 +1355,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { - int ret; + kvm_pte_t xn = 0, set = 0, clr = 0; s8 level; - kvm_pte_t set = 0, clr = 0; + int ret; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; @@ -1309,8 +1368,12 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - if (prot & KVM_PGTABLE_PROT_X) - clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + ret = stage2_set_xn_attr(prot, &xn); + if (ret) + return ret; + + set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; + clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) @@ -1341,7 +1404,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) .arg = pgt, }; - if (stage2_has_fwb(pgt)) + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); @@ -1541,37 +1604,80 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } -static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, - enum kvm_pgtable_walk_flags visit) +static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; - if (!stage2_pte_is_counted(ctx->old)) + mm_ops->put_page(ctx->ptep); + return 0; +} + +static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) return 0; + /* + * Drop references and clear the now stale PTE to avoid rewalking the + * freed page table. + */ mm_ops->put_page(ctx->ptep); + mm_ops->put_page(childp); + kvm_clear_pte(ctx->ptep); + return 0; +} - if (kvm_pte_table(ctx->old, ctx->level)) - mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + if (!stage2_pte_is_counted(ctx->old)) + return 0; - return 0; + switch (visit) { + case KVM_PGTABLE_WALK_LEAF: + return stage2_free_leaf(ctx); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_free_table_post(ctx); + default: + return -EINVAL; + } } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); +} + +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); + + /* + * Since the pgtable is unlinked at this point, and not shared with + * other walkers, safely deference pgd with kvm_dereference_pteref_raw() + */ + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); pgt->pgd = NULL; } +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); + kvm_pgtable_stage2_destroy_pgd(pgt); +} + void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 87a54375bd6e..f0605836821e 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT); - return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE); + return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE); } /* @@ -44,7 +44,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) /* Build the full address */ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu)); /* If not for GICV, move on */ if (fault_ipa < vgic->vgic_cpu_base || @@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return -1; } + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kvm_vgic_global_state.vcpu_hyp_va; addr += fault_ipa - vgic->vgic_cpu_base; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 3f9741e51d41..c4d2f1feea8b 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -14,11 +14,13 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include "../../vgic/vgic.h" + #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) -static u64 __gic_v3_get_lr(unsigned int lr) +u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: @@ -58,7 +60,7 @@ static u64 __gic_v3_get_lr(unsigned int lr) unreachable(); } -static void __gic_v3_set_lr(u64 val, int lr) +void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: @@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n) return val; } +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; @@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } - if (used_lrs || cpu_if->its_vpe.its_vm) { + if (used_lrs) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; @@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) __gic_v3_set_lr(0, i); } } + + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); + + /* + * Hack alert: On NV, this results in a trap so that the above write + * actually takes effect... No synchronisation is necessary, as we + * only care about the effects when this traps. + */ + read_gicreg(ICH_MISR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) @@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) u64 used_lrs = cpu_if->used_lrs; int i; - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Ensure that writes to the LRs, and on non-VHE systems ensure that @@ -274,7 +294,7 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); isb(); } else if (!cpu_if->vgic_sre) { @@ -295,40 +315,42 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } } - /* - * Prevent the guest from touching the ICC_SRE_EL1 system - * register. Note that this may not have any effect, as - * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); + /* Only disable SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + /* + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. + */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); + } /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected. Note that this also applies if we don't expect - * any system register access (no vgic at all). + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + /* Only restore SRE if the host implements the GICv2 interface */ + if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } } /* @@ -340,7 +362,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -423,29 +445,43 @@ void __vgic_v3_init_lrs(void) */ u64 __vgic_v3_get_gic_config(void) { - u64 val, sre = read_gicreg(ICC_SRE_EL1); + u64 val, sre; unsigned long flags = 0; /* - * To check whether we have a MMIO-based (GICv2 compatible) - * CPU interface, we need to disable the system register - * view. To do that safely, we have to prevent any interrupt - * from firing (which would be deadly). - * - * Note that this only makes sense on VHE, as interrupts are - * already masked for nVHE as part of the exception entry to - * EL2. + * In compat mode, we cannot access ICC_SRE_EL1 at any EL + * other than EL1 itself; just return the + * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5 + * system, so we first check if we have GICv5 support. */ - if (has_vhe()) - flags = local_daif_save(); + if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return read_gicreg(ICH_VTR_EL2); + sre = read_gicreg(ICC_SRE_EL1); /* + * To check whether we have a MMIO-based (GICv2 compatible) + * CPU interface, we need to disable the system register + * view. + * * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. */ - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); - isb(); + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + write_gicreg(0, ICC_SRE_EL1); isb(); @@ -453,11 +489,13 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); - isb(); - if (has_vhe()) + if (has_vhe()) { local_daif_restore(flags); + } else { + sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); @@ -465,6 +503,16 @@ u64 __vgic_v3_get_gic_config(void) return val; } +static void __vgic_v3_compat_mode_enable(void) +{ + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3); + /* Wait for V3 to become enabled */ + isb(); +} + static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); @@ -475,15 +523,10 @@ static void __vgic_v3_write_vmcr(u32 vmcr) write_gicreg(vmcr, ICH_VMCR_EL2); } -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) -{ - __vgic_v3_save_aprs(cpu_if); - if (cpu_if->vgic_sre) - cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); -} - void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { + __vgic_v3_compat_mode_enable(); + /* * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen * is dependent on ICC_SRE_EL1.SRE, and we have to perform the @@ -526,11 +569,11 @@ static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, continue; /* Group-0 interrupt, but Group-0 disabled? */ - if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK)) + if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_EL2_VENG0_MASK)) continue; /* Group-1 interrupt, but Group-1 disabled? */ - if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK)) + if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_EL2_VENG1_MASK)) continue; /* Not the highest priority? */ @@ -603,19 +646,19 @@ static int __vgic_v3_get_highest_active_priority(void) static unsigned int __vgic_v3_get_bpr0(u32 vmcr) { - return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + return FIELD_GET(ICH_VMCR_EL2_VBPR0, vmcr); } static unsigned int __vgic_v3_get_bpr1(u32 vmcr) { unsigned int bpr; - if (vmcr & ICH_VMCR_CBPR_MASK) { + if (vmcr & ICH_VMCR_EL2_VCBPR_MASK) { bpr = __vgic_v3_get_bpr0(vmcr); if (bpr < 7) bpr++; } else { - bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + bpr = FIELD_GET(ICH_VMCR_EL2_VBPR1, vmcr); } return bpr; @@ -715,7 +758,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) if (grp != !!(lr_val & ICH_LR_GROUP)) goto spurious; - pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; + pmr = FIELD_GET(ICH_VMCR_EL2_VPMR, vmcr); lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; if (pmr <= lr_prio) goto spurious; @@ -752,31 +795,37 @@ static void __vgic_v3_bump_eoicount(void) u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } -static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; int lr; /* EOImode == 0, nothing to be done here */ - if (!(vmcr & ICH_VMCR_EOIM_MASK)) - return; + if (!(vmcr & ICH_VMCR_EL2_VEOIM_MASK)) + return 1; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) - return; + return 1; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; } - __vgic_v3_clear_active_lr(lr, lr_val); + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) @@ -800,7 +849,7 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) } /* EOImode == 1 and not an LPI, nothing to be done here */ - if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + if ((vmcr & ICH_VMCR_EL2_VEOIM_MASK) && !(vid >= VGIC_MIN_LPI)) return; lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; @@ -816,22 +865,19 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK)); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VENG0, vmcr)); } static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK)); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VENG1, vmcr)); } static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); - if (val & 1) - vmcr |= ICH_VMCR_ENG0_MASK; - else - vmcr &= ~ICH_VMCR_ENG0_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VENG0, &vmcr, val & 1); __vgic_v3_write_vmcr(vmcr); } @@ -840,10 +886,7 @@ static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u64 val = vcpu_get_reg(vcpu, rt); - if (val & 1) - vmcr |= ICH_VMCR_ENG1_MASK; - else - vmcr &= ~ICH_VMCR_ENG1_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VENG1, &vmcr, val & 1); __vgic_v3_write_vmcr(vmcr); } @@ -867,10 +910,7 @@ static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt) if (val < bpr_min) val = bpr_min; - val <<= ICH_VMCR_BPR0_SHIFT; - val &= ICH_VMCR_BPR0_MASK; - vmcr &= ~ICH_VMCR_BPR0_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VBPR0, &vmcr, val); __vgic_v3_write_vmcr(vmcr); } @@ -880,17 +920,14 @@ static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt) u64 val = vcpu_get_reg(vcpu, rt); u8 bpr_min = __vgic_v3_bpr_min(); - if (vmcr & ICH_VMCR_CBPR_MASK) + if (FIELD_GET(ICH_VMCR_EL2_VCBPR, val)) return; /* Enforce BPR limiting */ if (val < bpr_min) val = bpr_min; - val <<= ICH_VMCR_BPR1_SHIFT; - val &= ICH_VMCR_BPR1_MASK; - vmcr &= ~ICH_VMCR_BPR1_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VBPR1, &vmcr, val); __vgic_v3_write_vmcr(vmcr); } @@ -980,19 +1017,14 @@ spurious: static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { - vmcr &= ICH_VMCR_PMR_MASK; - vmcr >>= ICH_VMCR_PMR_SHIFT; - vcpu_set_reg(vcpu, rt, vmcr); + vcpu_set_reg(vcpu, rt, FIELD_GET(ICH_VMCR_EL2_VPMR, vmcr)); } static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); - val <<= ICH_VMCR_PMR_SHIFT; - val &= ICH_VMCR_PMR_MASK; - vmcr &= ~ICH_VMCR_PMR_MASK; - vmcr |= val; + FIELD_MODIFY(ICH_VMCR_EL2_VPMR, &vmcr, val); write_gicreg(vmcr, ICH_VMCR_EL2); } @@ -1015,9 +1047,11 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ - val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT; + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, + FIELD_GET(ICH_VMCR_EL2_VEOIM, vmcr)); /* CBPR */ - val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, + FIELD_GET(ICH_VMCR_EL2_VCBPR, vmcr)); vcpu_set_reg(vcpu, rt, val); } @@ -1026,15 +1060,11 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 val = vcpu_get_reg(vcpu, rt); - if (val & ICC_CTLR_EL1_CBPR_MASK) - vmcr |= ICH_VMCR_CBPR_MASK; - else - vmcr &= ~ICH_VMCR_CBPR_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VCBPR, &vmcr, + FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val)); - if (val & ICC_CTLR_EL1_EOImode_MASK) - vmcr |= ICH_VMCR_EOIM_MASK; - else - vmcr &= ~ICH_VMCR_EOIM_MASK; + FIELD_MODIFY(ICH_VMCR_EL2_VEOIM, &vmcr, + FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val)); write_gicreg(vmcr, ICH_VMCR_EL2); } @@ -1044,7 +1074,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, { u64 ich_hcr; - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + if (!is_nested_ctxt(vcpu)) return false; ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); @@ -1052,11 +1082,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, switch (sysreg) { case SYS_ICC_IGRPEN0_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; @@ -1069,15 +1099,15 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR0_EL1: case SYS_ICC_HPPIR0_EL1: case SYS_ICC_IAR0_EL1: - return ich_hcr & ICH_HCR_TALL0; + return ich_hcr & ICH_HCR_EL2_TALL0; case SYS_ICC_IGRPEN1_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; @@ -1090,10 +1120,10 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR1_EL1: case SYS_ICC_HPPIR1_EL1: case SYS_ICC_IAR1_EL1: - return ich_hcr & ICH_HCR_TALL1; + return ich_hcr & ICH_HCR_EL2_TALL1; case SYS_ICC_DIR_EL1: - if (ich_hcr & ICH_HCR_TDIR) + if (ich_hcr & ICH_HCR_EL2_TDIR) return true; fallthrough; @@ -1101,7 +1131,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_RPR_EL1: case SYS_ICC_CTLR_EL1: case SYS_ICC_PMR_EL1: - return ich_hcr & ICH_HCR_TC; + return ich_hcr & ICH_HCR_EL2_TC; default: return false; @@ -1211,6 +1241,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). + */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: diff --git a/arch/arm64/kvm/hyp/vgic-v5-sr.c b/arch/arm64/kvm/hyp/vgic-v5-sr.c new file mode 100644 index 000000000000..47e6bcd43702 --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v5-sr.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, 2026 - Arm Ltd + */ + +#include <linux/irqchip/arm-gic-v5.h> + +#include <asm/kvm_hyp.h> + +void __vgic_v5_save_apr(struct vgic_v5_cpu_if *cpu_if) +{ + cpu_if->vgic_apr = read_sysreg_s(SYS_ICH_APR_EL2); +} + +static void __vgic_v5_compat_mode_disable(void) +{ + sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, ICH_VCTLR_EL2_V3, 0); + isb(); +} + +void __vgic_v5_restore_vmcr_apr(struct vgic_v5_cpu_if *cpu_if) +{ + __vgic_v5_compat_mode_disable(); + + write_sysreg_s(cpu_if->vgic_vmcr, SYS_ICH_VMCR_EL2); + write_sysreg_s(cpu_if->vgic_apr, SYS_ICH_APR_EL2); +} + +void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if) +{ + /* + * The following code assumes that the bitmap storage that we have for + * PPIs is either 64 (architected PPIs, only) or 128 bits (architected & + * impdef PPIs). + */ + BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); + + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, + read_sysreg_s(SYS_ICH_PPI_ACTIVER0_EL2), 0, 64); + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr, + read_sysreg_s(SYS_ICH_PPI_PENDR0_EL2), 0, 64); + + cpu_if->vgic_ppi_priorityr[0] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR0_EL2); + cpu_if->vgic_ppi_priorityr[1] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR1_EL2); + cpu_if->vgic_ppi_priorityr[2] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR2_EL2); + cpu_if->vgic_ppi_priorityr[3] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR3_EL2); + cpu_if->vgic_ppi_priorityr[4] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR4_EL2); + cpu_if->vgic_ppi_priorityr[5] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR5_EL2); + cpu_if->vgic_ppi_priorityr[6] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR6_EL2); + cpu_if->vgic_ppi_priorityr[7] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR7_EL2); + + if (VGIC_V5_NR_PRIVATE_IRQS == 128) { + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, + read_sysreg_s(SYS_ICH_PPI_ACTIVER1_EL2), 64, 64); + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr, + read_sysreg_s(SYS_ICH_PPI_PENDR1_EL2), 64, 64); + + cpu_if->vgic_ppi_priorityr[8] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR8_EL2); + cpu_if->vgic_ppi_priorityr[9] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR9_EL2); + cpu_if->vgic_ppi_priorityr[10] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR10_EL2); + cpu_if->vgic_ppi_priorityr[11] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR11_EL2); + cpu_if->vgic_ppi_priorityr[12] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR12_EL2); + cpu_if->vgic_ppi_priorityr[13] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR13_EL2); + cpu_if->vgic_ppi_priorityr[14] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR14_EL2); + cpu_if->vgic_ppi_priorityr[15] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR15_EL2); + } + + /* Now that we are done, disable DVI */ + write_sysreg_s(0, SYS_ICH_PPI_DVIR0_EL2); + write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); +} + +void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if) +{ + DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS); + + /* We assume 64 or 128 PPIs - see above comment */ + BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); + + /* Enable DVI so that the guest's interrupt config takes over */ + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 0, 64), + SYS_ICH_PPI_DVIR0_EL2); + + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 0, 64), + SYS_ICH_PPI_ACTIVER0_EL2); + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 0, 64), + SYS_ICH_PPI_ENABLER0_EL2); + + /* Update the pending state of the NON-DVI'd PPIs, only */ + bitmap_andnot(pendr, host_data_ptr(vgic_v5_ppi_state)->pendr, + cpu_if->vgic_ppi_dvir, VGIC_V5_NR_PRIVATE_IRQS); + write_sysreg_s(bitmap_read(pendr, 0, 64), SYS_ICH_PPI_PENDR0_EL2); + + write_sysreg_s(cpu_if->vgic_ppi_priorityr[0], + SYS_ICH_PPI_PRIORITYR0_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[1], + SYS_ICH_PPI_PRIORITYR1_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[2], + SYS_ICH_PPI_PRIORITYR2_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[3], + SYS_ICH_PPI_PRIORITYR3_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[4], + SYS_ICH_PPI_PRIORITYR4_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[5], + SYS_ICH_PPI_PRIORITYR5_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[6], + SYS_ICH_PPI_PRIORITYR6_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[7], + SYS_ICH_PPI_PRIORITYR7_EL2); + + if (VGIC_V5_NR_PRIVATE_IRQS == 128) { + /* Enable DVI so that the guest's interrupt config takes over */ + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 64, 64), + SYS_ICH_PPI_DVIR1_EL2); + + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 64, 64), + SYS_ICH_PPI_ACTIVER1_EL2); + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 64, 64), + SYS_ICH_PPI_ENABLER1_EL2); + write_sysreg_s(bitmap_read(pendr, 64, 64), + SYS_ICH_PPI_PENDR1_EL2); + + write_sysreg_s(cpu_if->vgic_ppi_priorityr[8], + SYS_ICH_PPI_PRIORITYR8_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[9], + SYS_ICH_PPI_PRIORITYR9_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[10], + SYS_ICH_PPI_PRIORITYR10_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[11], + SYS_ICH_PPI_PRIORITYR11_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[12], + SYS_ICH_PPI_PRIORITYR12_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[13], + SYS_ICH_PPI_PRIORITYR13_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[14], + SYS_ICH_PPI_PRIORITYR14_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[15], + SYS_ICH_PPI_PRIORITYR15_EL2); + } else { + write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_ACTIVER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_ENABLER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PENDR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR8_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR9_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR10_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR11_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR12_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR13_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR14_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR15_EL2); + } +} + +void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if) +{ + cpu_if->vgic_vmcr = read_sysreg_s(SYS_ICH_VMCR_EL2); + cpu_if->vgic_icsr = read_sysreg_s(SYS_ICC_ICSR_EL1); +} + +void __vgic_v5_restore_state(struct vgic_v5_cpu_if *cpu_if) +{ + write_sysreg_s(cpu_if->vgic_icsr, SYS_ICC_ICSR_EL1); +} diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index afc4aed9231a..9695328bbd96 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -10,4 +10,4 @@ CFLAGS_switch.o += -Wno-override-init obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o ../exception.o + ../fpsimd.o ../hyp-entry.o ../exception.o ../vgic-v5-sr.o diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 647737d6e8d0..1e8995add14f 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -43,107 +43,71 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); * * - API/APK: they are already accounted for by vcpu_load(), and can * only take effect across a load/put cycle (such as ERET) + * + * - FIEN: no way we let a guest have access to the RAS "Common Fault + * Injection" thing, whatever that does */ -#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK) +#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN) static u64 __compute_hcr(struct kvm_vcpu *vcpu) { - u64 hcr = vcpu->arch.hcr_el2; + u64 guest_hcr, hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) return hcr; + /* + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. + */ if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; - write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); - } - - return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); -} - -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 cptr; - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; + /* + * Nothing in HCR_EL2 should impact running in hypervisor + * context, apart from bits we have defined as RESx (E2H, + * HCD and co), or that cannot be set directly (the EXCLUDE + * bits). Given that we OR the guest's view with the host's, + * we can use the 0 value as the starting point, and only + * use the config-driven RES1 bits. + */ + guest_hcr = kvm_vcpu_apply_reg_masks(vcpu, HCR_EL2, 0); - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; + write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); } else { - __activate_traps_fpsimd32(vcpu); - } - - if (!vcpu_has_nv(vcpu)) - goto write; - - /* - * The architecture is a bit crap (what a surprise): an EL2 guest - * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, - * as they are RES0 in the guest's view. To work around it, trap the - * sucker using the very same bit it can't set... - */ - if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) - val |= CPTR_EL2_TCPAC; - - /* - * Layer the guest hypervisor's trap configuration on top of our own if - * we're in a nested context. - */ - if (is_hyp_ctxt(vcpu)) - goto write; - - cptr = vcpu_sanitised_cptr_el2(vcpu); - - /* - * Pay attention, there's some interesting detail here. - * - * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two - * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): - * - * - CPTR_EL2.xEN = x0, traps are enabled - * - CPTR_EL2.xEN = x1, traps are disabled - * - * In other words, bit[0] determines if guest accesses trap or not. In - * the interest of simplicity, clear the entire field if the guest - * hypervisor has traps enabled to dispel any illusion of something more - * complicated taking place. - */ - if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_FPEN; - if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_ZEN; + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_EL1_E0POE; + guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); - val |= cptr & CPTR_EL2_TCPAC; + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); -write: - write_sysreg(val, cpacr_el1); -} + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + /* + * Exclude the guest's TWED configuration if it hasn't set TWE + * to avoid potentially delaying traps for the host. + */ + if (!(guest_hcr & HCR_TWE)) + guest_hcr &= ~(HCR_EL2_TWEDEn | HCR_EL2_TWEDEL); + } - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); - write_sysreg(val, cpacr_el1); + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_traps(struct kvm_vcpu *vcpu) @@ -184,7 +148,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) ___deactivate_traps(vcpu); - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); if (has_cntpoff()) { struct timer_map map; @@ -198,9 +162,9 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ val = read_sysreg_el0(SYS_CNTP_CVAL); if (map.direct_ptimer == vcpu_ptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); if (map.direct_ptimer == vcpu_hptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); offset = read_sysreg_s(SYS_CNTPOFF_EL2); @@ -459,6 +423,14 @@ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) if (ret) return false; + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + __kvm_skip_instr(vcpu); return true; @@ -527,6 +499,25 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) return kvm_hyp_handle_sysreg(vcpu, exit_code); } +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -538,17 +529,23 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, }; static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - synchronize_vcpu_pstate(vcpu, exit_code); + synchronize_vcpu_pstate(vcpu); /* * If we were in HYP context on entry, adjust the PSTATE view - * so that the usual helpers work correctly. + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. */ - if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) { + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { @@ -564,6 +561,10 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) |= mode; } + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } @@ -577,10 +578,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - sysreg_save_host_state_vhe(host_ctxt); - fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -605,15 +606,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); - fpsimd_lazy_switch_to_host(vcpu); - sysreg_restore_host_state_vhe(host_ctxt); + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ + isb(); + + fpsimd_lazy_switch_to_host(vcpu); + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); - return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -643,12 +652,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() - * to make sure these changes take effect before running the host or - * additional guests. - */ return ret; } @@ -660,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; - __deactivate_traps(vcpu); + if (vcpu) + __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 90b018e06f2c..be685b63e8cf 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -18,17 +18,17 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ - __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); - __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); - - __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); - __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); - __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); - __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); - __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); - __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); - __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); - __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); + __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); + + __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); + __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); + __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); + __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); + __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); + __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); + __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); + __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); /* * In VHE mode those registers are compatible between EL1 and EL2, @@ -46,21 +46,21 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ - __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); - __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); - __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); - __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); + __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); + __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); + __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); - __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); + __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) - __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); + __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); } /* @@ -70,13 +70,16 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) */ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); } - __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); - __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); - __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); + __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); + __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + __vcpu_assign_sys_reg(vcpu, SCTLR2_EL2, read_sysreg_el1(SYS_SCTLR2)); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) @@ -87,11 +90,12 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); - write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); - write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* @@ -138,6 +142,9 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); + + if (ctxt_has_sctlr2(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR2_EL2), SYS_SCTLR2); } /* @@ -176,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); +/* + * The _EL0 value was written by the host's context switch and belongs to the + * VMM. Copy this into the guest's _EL1 register. + */ +static inline void __mpam_guest_load(void) +{ + u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I; + + if (system_supports_mpam()) { + u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN; + + write_sysreg_el1(val, SYS_MPAM1); + } +} + /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * @@ -191,14 +213,14 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - u64 mpidr; + u64 midr, mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* * When running a normal EL1 guest, we only load a new vcpu - * after a context switch, which imvolves a DSB, so all + * after a context switch, which involves a DSB, so all * speculative EL1&0 walks will have already completed. * If running NV, the vcpu may transition between vEL1 and * vEL2 without a context switch, so make sure we complete @@ -215,28 +237,24 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); + __mpam_guest_load(); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { /* - * Use the guest hypervisor's VPIDR_EL2 when in a - * nested state. The hardware value of MIDR_EL1 gets - * restored on put. - */ - write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); - - /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { + midr = ctxt_midr_el1(guest_ctxt); mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } - __sysreg_restore_el1_state(guest_ctxt, mpidr); + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); @@ -271,9 +289,5 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - /* If leaving a nesting guest, restore MIDR_EL1 default view */ - if (vcpu_has_nv(vcpu)) - write_sysreg(read_cpuid_id(), vpidr_el2); - vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 3d50a1bd2bdb..f7b9dfe3f3a5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -63,7 +63,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu, __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); } @@ -73,7 +73,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) * We're done with the TLB operation, let's restore the host's * view of HCR_EL2. */ - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); isb(); /* ... and the stage-2 MMU context that we switched away from */ @@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -115,7 +114,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* @@ -176,7 +174,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -192,7 +190,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -217,7 +215,7 @@ void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } /* @@ -358,7 +356,7 @@ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) default: ret = -EINVAL; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); if (mmu) |
