diff options
58 files changed, 1409 insertions, 387 deletions
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b1e43f56ee46..6c13fd47e170 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -56,6 +56,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64mmfr1; u64 reg_id_aa64mmfr2; u64 reg_id_aa64mmfr3; + u64 reg_id_aa64mmfr4; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; u64 reg_id_aa64zfr0; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bd8d4ca81a48..d7c6988188d9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -363,6 +363,7 @@ struct arm64_cpu_capabilities { u8 field_pos; u8 field_width; u8 min_field_value; + u8 max_field_value; u8 hwcap_type; bool sign; unsigned long hwcap; diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3c6f8ba1e479..a1769e415d72 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -102,9 +102,7 @@ #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) -#define HCRX_GUEST_FLAGS \ - (HCRX_EL2_SMPME | HCRX_EL2_TCR2En | \ - (cpus_have_final_cap(ARM64_HAS_MOPS) ? (HCRX_EL2_MSCEn | HCRX_EL2_MCE2) : 0)) +#define HCRX_GUEST_FLAGS (HCRX_EL2_SMPME | HCRX_EL2_TCR2En) #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En) /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b804fe832184..debc3753d2ef 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -209,7 +209,8 @@ static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt) { - return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H; + return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || + (ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H)); } static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 21c57b812569..6883963bbc3a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -238,9 +238,32 @@ static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr) return index; } +struct kvm_sysreg_masks; + +enum fgt_group_id { + __NO_FGT_GROUP__, + HFGxTR_GROUP, + HDFGRTR_GROUP, + HDFGWTR_GROUP = HDFGRTR_GROUP, + HFGITR_GROUP, + HAFGRTR_GROUP, + + /* Must be last */ + __NR_FGT_GROUP_IDS__ +}; + struct kvm_arch { struct kvm_s2_mmu mmu; + /* + * Fine-Grained UNDEF, mimicking the FGT layout defined by the + * architecture. We track them globally, as we present the + * same feature-set to all vcpus. + * + * Index 0 is currently spare. + */ + u64 fgu[__NR_FGT_GROUP_IDS__]; + /* Interrupt controller */ struct vgic_dist vgic; @@ -274,6 +297,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 /* Initial ID reg values loaded */ #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 7 + /* Fine-Grained UNDEF initialised */ +#define KVM_ARCH_FLAG_FGU_INITIALIZED 8 unsigned long flags; /* VM-wide vCPU feature set */ @@ -294,6 +319,9 @@ struct kvm_arch { /* PMCR_EL0.N value for the guest */ u8 pmcr_n; + /* Iterator for idreg debugfs */ + u8 idreg_debugfs_iter; + /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; @@ -312,6 +340,9 @@ struct kvm_arch { #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; + /* Masks for VNCR-baked sysregs */ + struct kvm_sysreg_masks *sysreg_masks; + /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. @@ -474,6 +505,13 @@ enum vcpu_sysreg { NR_SYS_REGS /* Nothing after this line! */ }; +struct kvm_sysreg_masks { + struct { + u64 res0; + u64 res1; + } mask[NR_SYS_REGS - __VNCR_START__]; +}; + struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ @@ -549,6 +587,7 @@ struct kvm_vcpu_arch { /* Values of trap registers for the guest. */ u64 hcr_el2; + u64 hcrx_el2; u64 mdcr_el2; u64 cptr_el2; @@ -868,7 +907,15 @@ static inline u64 *__ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) -#define __vcpu_sys_reg(v,r) (ctxt_sys_reg(&(v)->arch.ctxt, (r))) +u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg); +#define __vcpu_sys_reg(v,r) \ + (*({ \ + const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ + u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ + if (vcpu_has_nv((v)) && (r) >= __VNCR_START__) \ + *__r = kvm_vcpu_sanitise_vncr_reg((v), (r)); \ + __r; \ + })) u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); @@ -1055,14 +1102,20 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); +void kvm_sys_regs_create_debugfs(struct kvm *kvm); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); int __init kvm_sys_reg_table_init(void); +struct sys_reg_desc; +int __init populate_sysreg_config(const struct sys_reg_desc *sr, + unsigned int idx); int __init populate_nv_trap_config(void); bool lock_all_vcpus(struct kvm *kvm); void unlock_all_vcpus(struct kvm *kvm); +void kvm_init_sysreg(struct kvm_vcpu *); + /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); @@ -1233,4 +1286,48 @@ static inline void kvm_hyp_reserve(void) { } void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); +#define __expand_field_sign_unsigned(id, fld, val) \ + ((u64)SYS_FIELD_VALUE(id, fld, val)) + +#define __expand_field_sign_signed(id, fld, val) \ + ({ \ + u64 __val = SYS_FIELD_VALUE(id, fld, val); \ + sign_extend64(__val, id##_##fld##_WIDTH - 1); \ + }) + +#define expand_field_sign(id, fld, val) \ + (id##_##fld##_SIGNED ? \ + __expand_field_sign_signed(id, fld, val) : \ + __expand_field_sign_unsigned(id, fld, val)) + +#define get_idreg_field_unsigned(kvm, id, fld) \ + ({ \ + u64 __val = IDREG((kvm), SYS_##id); \ + FIELD_GET(id##_##fld##_MASK, __val); \ + }) + +#define get_idreg_field_signed(kvm, id, fld) \ + ({ \ + u64 __val = get_idreg_field_unsigned(kvm, id, fld); \ + sign_extend64(__val, id##_##fld##_WIDTH - 1); \ + }) + +#define get_idreg_field_enum(kvm, id, fld) \ + get_idreg_field_unsigned(kvm, id, fld) + +#define get_idreg_field(kvm, id, fld) \ + (id##_##fld##_SIGNED ? \ + get_idreg_field_signed(kvm, id, fld) : \ + get_idreg_field_unsigned(kvm, id, fld)) + +#define kvm_has_feat(kvm, id, fld, limit) \ + (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, limit)) + +#define kvm_has_feat_enum(kvm, id, fld, val) \ + (get_idreg_field_unsigned((kvm), id, fld) == __expand_field_sign_unsigned(id, fld, val)) + +#define kvm_has_feat_range(kvm, id, fld, min, max) \ + (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, min) && \ + get_idreg_field((kvm), id, fld) <= expand_field_sign(id, fld, max)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 145ce73fc16c..3e2a1ac0c9bb 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -70,7 +70,7 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); /* * Without an __arch_swab32(), we fall back to ___constant_swab32(), but the * static inline can allow the compiler to out-of-line this. KVM always wants - * the macro version as its always inlined. + * the macro version as it's always inlined. */ #define __kvm_swab32(x) ___constant_swab32(x) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e3e793d0ec30..d5e48d870461 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -54,27 +54,6 @@ #include <asm/alternative.h> /* - * Convert a kernel VA into a HYP VA. - * reg: VA to be converted. - * - * The actual code generation takes place in kvm_update_va_mask, and - * the instructions below are only there to reserve the space and - * perform the register allocation (kvm_update_va_mask uses the - * specific registers encoded in the instructions). - */ -.macro kern_hyp_va reg -#ifndef __KVM_VHE_HYPERVISOR__ -alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask - and \reg, \reg, #1 /* mask with va_mask */ - ror \reg, \reg, #1 /* rotate to the first tag bit */ - add \reg, \reg, #0 /* insert the low 12 bits of the tag */ - add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */ - ror \reg, \reg, #63 /* rotate back */ -alternative_cb_end -#endif -.endm - -/* * Convert a hypervisor VA to a PA * reg: hypervisor address to be converted in place * tmp: temporary register @@ -127,14 +106,29 @@ void kvm_apply_hyp_relocations(void); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) +/* + * Convert a kernel VA into a HYP VA. + * + * Can be called from hyp or non-hyp context. + * + * The actual code generation takes place in kvm_update_va_mask(), and + * the instructions below are only there to reserve the space and + * perform the register allocation (kvm_update_va_mask() uses the + * specific registers encoded in the instructions). + */ static __always_inline unsigned long __kern_hyp_va(unsigned long v) { +/* + * This #ifndef is an optimisation for when this is called from VHE hyp + * context. When called from a VHE non-hyp context, kvm_update_va_mask() will + * replace the instructions with `nop`s. + */ #ifndef __KVM_VHE_HYPERVISOR__ - asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" - "ror %0, %0, #1\n" - "add %0, %0, #0\n" - "add %0, %0, #0, lsl 12\n" - "ror %0, %0, #63\n", + asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" /* mask with va_mask */ + "ror %0, %0, #1\n" /* rotate to the first tag bit */ + "add %0, %0, #0\n" /* insert the low 12 bits of the tag */ + "add %0, %0, #0, lsl 12\n" /* insert the top 12 bits of the tag */ + "ror %0, %0, #63\n", /* rotate back */ ARM64_ALWAYS_SYSTEM, kvm_update_va_mask) : "+r" (v)); diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 4882905357f4..c77d795556e1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -60,7 +60,6 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) return ttbr0 & ~GENMASK_ULL(63, 48); } -extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu); int kvm_init_nv_sysregs(struct kvm *kvm); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index cfdf40f734b1..19278dfe7978 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -197,6 +197,7 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. @@ -208,6 +209,7 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index d82305ab420f..449ca2ff1df6 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -173,6 +173,7 @@ * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf +#define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* @@ -180,6 +181,7 @@ * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 +#define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index c3b19b376c86..9e8999592f3a 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1181,6 +1181,8 @@ par; \ }) +#define SYS_FIELD_VALUE(reg, field, val) reg##_##field##_##val + #define SYS_FIELD_GET(reg, field, val) \ FIELD_GET(reg##_##field##_MASK, val) @@ -1188,7 +1190,8 @@ FIELD_PREP(reg##_##field##_MASK, val) #define SYS_FIELD_PREP_ENUM(reg, field, val) \ - FIELD_PREP(reg##_##field##_MASK, reg##_##field##_##val) + FIELD_PREP(reg##_##field##_MASK, \ + SYS_FIELD_VALUE(reg, field, val)) #endif diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d1a634a403e..f309fd542c20 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -140,12 +140,42 @@ void dump_cpu_features(void) pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } +#define __ARM64_MAX_POSITIVE(reg, field) \ + ((reg##_##field##_SIGNED ? \ + BIT(reg##_##field##_WIDTH - 1) : \ + BIT(reg##_##field##_WIDTH)) - 1) + +#define __ARM64_MIN_NEGATIVE(reg, field) BIT(reg##_##field##_WIDTH - 1) + +#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value) \ + .sys_reg = SYS_##reg, \ + .field_pos = reg##_##field##_SHIFT, \ + .field_width = reg##_##field##_WIDTH, \ + .sign = reg##_##field##_SIGNED, \ + .min_field_value = min_value, \ + .max_field_value = max_value, + +/* + * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to + * an implicit maximum that depends on the sign-ess of the field. + * + * An unsigned field will be capped at all ones, while a signed field + * will be limited to the positive half only. + */ #define ARM64_CPUID_FIELDS(reg, field, min_value) \ - .sys_reg = SYS_##reg, \ - .field_pos = reg##_##field##_SHIFT, \ - .field_width = reg##_##field##_WIDTH, \ - .sign = reg##_##field##_SIGNED, \ - .min_field_value = reg##_##field##_##min_value, + __ARM64_CPUID_FIELDS(reg, field, \ + SYS_FIELD_VALUE(reg, field, min_value), \ + __ARM64_MAX_POSITIVE(reg, field)) + +/* + * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an + * implicit minimal value to max_value. This should be used when + * matching a non-implemented property. + */ +#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value) \ + __ARM64_CPUID_FIELDS(reg, field, \ + __ARM64_MIN_NEGATIVE(reg, field), \ + SYS_FIELD_VALUE(reg, field, max_value)) #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ @@ -407,6 +437,11 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), @@ -724,6 +759,7 @@ static const struct __ftr_reg_entry { &id_aa64mmfr1_override), ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), + ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -919,7 +955,8 @@ static void init_cpu_ftr_reg(u32 sys_reg, u64 new) pr_warn("%s[%d:%d]: %s to %llx\n", reg->name, ftrp->shift + ftrp->width - 1, - ftrp->shift, str, tmp); + ftrp->shift, str, + tmp & (BIT(ftrp->width) - 1)); } else if ((ftr_mask & reg->override->val) == ftr_mask) { reg->override->val &= ~ftr_mask; pr_warn("%s[%d:%d]: impossible override, ignored\n", @@ -1047,6 +1084,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); + init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); @@ -1418,6 +1456,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64MMFR3_EL1); + read_sysreg_case(SYS_ID_AA64MMFR4_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); read_sysreg_case(SYS_ID_AA64ISAR2_EL1); @@ -1451,11 +1490,28 @@ has_always(const struct arm64_cpu_capabilities *entry, int scope) static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { - int val = cpuid_feature_extract_field_width(reg, entry->field_pos, - entry->field_width, - entry->sign); + int val, min, max; + u64 tmp; + + val = cpuid_feature_extract_field_width(reg, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->min_field_value; + tmp <<= entry->field_pos; - return val >= entry->min_field_value; + min = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + tmp = entry->max_field_value; + tmp <<= entry->field_pos; + + max = cpuid_feature_extract_field_width(tmp, entry->field_pos, + entry->field_width, + entry->sign); + + return val >= min && val <= max; } static u64 @@ -1739,6 +1795,28 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } +static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) +{ + /* + * Although the Apple M2 family appears to support NV1, the + * PTW barfs on the nVHE EL2 S1 page table format. Pretend + * that it doesn't support NV1 at all. + */ + static const struct midr_range nv1_ni_list[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {} + }; + + return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && + !(has_cpuid_feature(entry, scope) || + is_midr_in_range_list(read_cpuid_id(), nv1_ni_list))); +} + #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) static bool has_lpa2_at_stage1(u64 mmfr0) { @@ -2739,6 +2817,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_lpa2, }, + { + .desc = "NV1", + .capability = ARM64_HAS_HCR_NV1, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_nv1, + ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) + }, {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 47043c0d95ec..7ca3fbd200f0 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -447,6 +447,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); + info->reg_id_aa64mmfr4 = read_cpuid(ID_AA64MMFR4_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cab7f91949d8..5bdafbcff009 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -584,25 +584,32 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF /* - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, - * making it impossible to start in nVHE mode. Is that - * compliant with the architecture? Absolutely not! + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be + * RES1 in that case. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, but + * don't advertise it (they predate this relaxation). */ + mrs_s x0, SYS_ID_AA64MMFR4_EL1 + ubfx x0, x0, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH + tbnz x0, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f + mrs x0, hcr_el2 and x0, x0, #HCR_E2H - cbz x0, 1f - + cbz x0, 2f +1: /* Set a sane SCTLR_EL1, the VHE way */ pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 mov x2, #BOOT_CPU_FLAG_E2H - b 2f + b 3f -1: +2: pre_disable_mmu_workaround msr sctlr_el1, x1 mov x2, xzr -2: +3: __init_el2_nvhe_prepare_eret mov w0, #BOOT_CPU_MODE_EL2 diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 4b25fa4b167c..937f15b7d8c3 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -37,7 +37,6 @@ menuconfig KVM select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS - select XARRAY_MULTI help Support hosting virtualized guest machines. @@ -66,4 +65,15 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config KVM_ARM64_RES_BITS_PARANOIA + bool "Build-time check of RES0/RES1 bits" + depends on KVM + default n + help + Say Y here to validate that KVM's knowledge of most system + registers' RES0/RES1 bits matches when the rest of the kernel + defines. Expect the build to fail badly if you enable this. + + Just say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 9dec8c419bf4..879982b1cc73 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -745,7 +745,7 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, WARN_ON_ONCE(ret); /* - * The virtual offset behaviour is "interresting", as it + * The virtual offset behaviour is "interesting", as it * always applies when HCR_EL2.E2H==0, but only when * accessed from EL1 when HCR_EL2.E2H==1. So make sure we * track E2H when putting the HV timer in "direct" mode. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a25265aca432..3dee5490eea9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -190,6 +190,10 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + kvm_sys_regs_create_debugfs(kvm); +} /** * kvm_arch_destroy_vm - destroy the VM data structure @@ -206,6 +210,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) pkvm_destroy_hyp_vm(kvm); kfree(kvm->arch.mpidr_data); + kfree(kvm->arch.sysreg_masks); kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); @@ -674,6 +679,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } + /* + * This needs to happen after NV has imposed its own restrictions on + * the feature set + */ + kvm_init_sysreg(vcpu); + ret = kvm_timer_enable(vcpu); if (ret) return ret; @@ -2591,7 +2602,8 @@ static __init int kvm_arm_init(void) } else if (in_hyp_mode) { kvm_info("VHE mode initialized successfully\n"); } else { - kvm_info("Hyp mode initialized successfully\n"); + char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n'; + kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode); } /* diff --git a/arch/arm64/kvm/check-res-bits.h b/arch/arm64/kvm/check-res-bits.h new file mode 100644 index 000000000000..2d98e60efc3c --- /dev/null +++ b/arch/arm64/kvm/check-res-bits.h @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 - Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <asm/sysreg-defs.h> + +/* + * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + * + * If any of these BUILD_BUG_ON() fails, that's because some bits that + * were reserved have gained some other meaning, and KVM needs to know + * about those. + * + * In such case, do *NOT* blindly change the assertion so that it + * passes, but also teach the rest of the code about the actual + * change. + * + * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING + */ +static inline void check_res_bits(void) +{ +#ifdef CONFIG_KVM_ARM64_RES_BITS_PARANOIA + + BUILD_BUG_ON(OSDTRRX_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(MDCCINT_EL1_RES0 != (GENMASK_ULL(63, 31) | GENMASK_ULL(28, 0))); + BUILD_BUG_ON(MDSCR_EL1_RES0 != (GENMASK_ULL(63, 36) | GENMASK_ULL(28, 28) | GENMASK_ULL(25, 24) | GENMASK_ULL(20, 20) | GENMASK_ULL(18, 16) | GENMASK_ULL(11, 7) | GENMASK_ULL(5, 1))); + BUILD_BUG_ON(OSDTRTX_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(OSECCR_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(OSLAR_EL1_RES0 != (GENMASK_ULL(63, 1))); + BUILD_BUG_ON(ID_PFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_PFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_DFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_AFR0_EL1_RES0 != (GENMASK_ULL(63, 16))); + BUILD_BUG_ON(ID_MMFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_MMFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_MMFR2_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_MMFR3_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_ISAR0_EL1_RES0 != (GENMASK_ULL(63, 28))); + BUILD_BUG_ON(ID_ISAR1_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_ISAR2_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_ISAR3_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_ISAR4_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_ISAR5_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(23, 20))); + BUILD_BUG_ON(ID_ISAR6_EL1_RES0 != (GENMASK_ULL(63, 28))); + BUILD_BUG_ON(ID_MMFR4_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(MVFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(MVFR1_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(MVFR2_EL1_RES0 != (GENMASK_ULL(63, 8))); + BUILD_BUG_ON(ID_PFR2_EL1_RES0 != (GENMASK_ULL(63, 12))); + BUILD_BUG_ON(ID_DFR1_EL1_RES0 != (GENMASK_ULL(63, 8))); + BUILD_BUG_ON(ID_MMFR5_EL1_RES0 != (GENMASK_ULL(63, 8))); + BUILD_BUG_ON(ID_AA64PFR1_EL1_RES0 != (GENMASK_ULL(23, 20))); + BUILD_BUG_ON(ID_AA64PFR2_EL1_RES0 != (GENMASK_ULL(63, 36) | GENMASK_ULL(31, 12))); + BUILD_BUG_ON(ID_AA64ZFR0_EL1_RES0 != (GENMASK_ULL(63, 60) | GENMASK_ULL(51, 48) | GENMASK_ULL(39, 36) | GENMASK_ULL(31, 28) | GENMASK_ULL(15, 8))); + BUILD_BUG_ON(ID_AA64SMFR0_EL1_RES0 != (GENMASK_ULL(62, 61) | GENMASK_ULL(51, 49) | GENMASK_ULL(31, 31) | GENMASK_ULL(27, 0))); + BUILD_BUG_ON(ID_AA64FPFR0_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 2))); + BUILD_BUG_ON(ID_AA64DFR0_EL1_RES0 != (GENMASK_ULL(27, 24) | GENMASK_ULL(19, 16))); + BUILD_BUG_ON(ID_AA64DFR1_EL1_RES0 != (GENMASK_ULL(63, 0))); + BUILD_BUG_ON(ID_AA64AFR0_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(ID_AA64AFR1_EL1_RES0 != (GENMASK_ULL(63, 0))); + BUILD_BUG_ON(ID_AA64ISAR0_EL1_RES0 != (GENMASK_ULL(3, 0))); + BUILD_BUG_ON(ID_AA64ISAR2_EL1_RES0 != (GENMASK_ULL(47, 44))); + BUILD_BUG_ON(ID_AA64ISAR3_EL1_RES0 != (GENMASK_ULL(63, 16))); + BUILD_BUG_ON(ID_AA64MMFR0_EL1_RES0 != (GENMASK_ULL(55, 48))); + BUILD_BUG_ON(ID_AA64MMFR2_EL1_RES0 != (GENMASK_ULL(47, 44))); + BUILD_BUG_ON(ID_AA64MMFR3_EL1_RES0 != (GENMASK_ULL(51, 48))); + BUILD_BUG_ON(ID_AA64MMFR4_EL1_RES0 != (GENMASK_ULL(63, 40) | GENMASK_ULL(35, 28) | GENMASK_ULL(3, 0))); + BUILD_BUG_ON(SCTLR_EL1_RES0 != (GENMASK_ULL(17, 17))); + BUILD_BUG_ON(CPACR_ELx_RES0 != (GENMASK_ULL(63, 30) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | GENMASK_ULL(19, 18) | GENMASK_ULL(15, 0))); + BUILD_BUG_ON(SMPRI_EL1_RES0 != (GENMASK_ULL(63, 4))); + BUILD_BUG_ON(ZCR_ELx_RES0 != (GENMASK_ULL(63, 9))); + BUILD_BUG_ON(SMCR_ELx_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(29, 9))); + BUILD_BUG_ON(GCSCR_ELx_RES0 != (GENMASK_ULL(63, 10) | GENMASK_ULL(7, 7) | GENMASK_ULL(4, 1))); + BUILD_BUG_ON(GCSPR_ELx_RES0 != (GENMASK_ULL(2, 0))); + BUILD_BUG_ON(GCSCRE0_EL1_RES0 != (GENMASK_ULL(63, 11) | GENMASK_ULL(7, 6) | GENMASK_ULL(4, 1))); + BUILD_BUG_ON(ALLINT_RES0 != (GENMASK_ULL(63, 14) | GENMASK_ULL(12, 0))); + BUILD_BUG_ON(PMSCR_EL1_RES0 != (GENMASK_ULL(63, 8) | GENMASK_ULL(2, 2))); + BUILD_BUG_ON(PMSICR_EL1_RES0 != (GENMASK_ULL(55, 32))); + BUILD_BUG_ON(PMSIRR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(7, 1))); + BUILD_BUG_ON(PMSFCR_EL1_RES0 != (GENMASK_ULL(63, 19) | GENMASK_ULL(15, 4))); + BUILD_BUG_ON(PMSLATFR_EL1_RES0 != (GENMASK_ULL(63, 16))); + BUILD_BUG_ON(PMSIDR_EL1_RES0 != (GENMASK_ULL(63, 25) | GENMASK_ULL(7, 7))); + BUILD_BUG_ON(PMBLIMITR_EL1_RES0 != (GENMASK_ULL(11, 6) | GENMASK_ULL(4, 3))); + BUILD_BUG_ON(PMBSR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(25, 20))); + BUILD_BUG_ON(PMBIDR_EL1_RES0 != (GENMASK_ULL(63, 12) | GENMASK_ULL(7, 6))); + BUILD_BUG_ON(CONTEXTIDR_ELx_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(CCSIDR_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(CLIDR_EL1_RES0 != (GENMASK_ULL(63, 47))); + BUILD_BUG_ON(CCSIDR2_EL1_RES0 != (GENMASK_ULL(63, 24))); + BUILD_BUG_ON(GMID_EL1_RES0 != (GENMASK_ULL(63, 4))); + BUILD_BUG_ON(SMIDR_EL1_RES0 != (GENMASK_ULL(63, 32) | GENMASK_ULL(14, 12))); + BUILD_BUG_ON(CSSELR_EL1_RES0 != (GENMASK_ULL(63, 5))); + BUILD_BUG_ON(CTR_EL0_RES0 != (GENMASK_ULL(63, 38) | GENMASK_ULL(30, 30) | GENMASK_ULL(13, 4))); + BUILD_BUG_ON(CTR_EL0_RES1 != (GENMASK_ULL(31, 31))); + BUILD_BUG_ON(DCZID_EL0_RES0 != (GENMASK_ULL(63, 5))); + BUILD_BUG_ON(SVCR_RES0 != (GENMASK_ULL(63, 2))); + BUILD_BUG_ON(FPMR_RES0 != (GENMASK_ULL(63, 38) | GENMASK_ULL(23, 23) | GENMASK_ULL(13, 9))); + BUILD_BUG_ON(HFGxTR_EL2_RES0 != (GENMASK_ULL(51, 51))); + BUILD_BUG_ON(HFGITR_EL2_RES0 != (GENMASK_ULL(63, 63) | GENMASK_ULL(61, 61))); + BUILD_BUG_ON(HDFGRTR_EL2_RES0 != (GENMASK_ULL(49, 49) | GENMASK_ULL(42, 42) | GENMASK_ULL(39, 38) | GENMASK_ULL(21, 20) | GENMASK_ULL(8, 8))); + BUILD_BUG_ON(HDFGWTR_EL2_RES0 != (GENMASK_ULL(63, 63) | GENMASK_ULL(59, 58) | GENMASK_ULL(51, 51) | GENMASK_ULL(47, 47) | GENMASK_ULL(43, 43) | GENMASK_ULL(40, 38) | GENMASK_ULL(34, 34) | GENMASK_ULL(30, 30) | GENMASK_ULL(22, 22) | GENMASK_ULL(9, 9) | GENMASK_ULL(6, 6))); + BUILD_BUG_ON(HAFGRTR_EL2_RES0 != (GENMASK_ULL(63, 50) | GENMASK_ULL(16, 5))); + BUILD_BUG_ON(HCRX_EL2_RES0 != (GENMASK_ULL(63, 25) | GENMASK_ULL(13, 12))); + BUILD_BUG_ON(DACR32_EL2_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(PMSCR_EL2_RES0 != (GENMASK_ULL(63, 8) | GENMASK_ULL(2, 2))); + BUILD_BUG_ON(TCR2_EL1x_RES0 != (GENMASK_ULL(63, 16) | GENMASK_ULL(13, 12) | GENMASK_ULL(9, 6))); + BUILD_BUG_ON(TCR2_EL2_RES0 != (GENMASK_ULL(63, 16))); + BUILD_BUG_ON(LORSA_EL1_RES0 != (GENMASK_ULL(63, 52) | GENMASK_ULL(15, 1))); + BUILD_BUG_ON(LOREA_EL1_RES0 != (GENMASK_ULL(63, 52) | GENMASK_ULL(15, 0))); + BUILD_BUG_ON(LORN_EL1_RES0 != (GENMASK_ULL(63, 8))); + BUILD_BUG_ON(LORC_EL1_RES0 != (GENMASK_ULL(63, 10) | GENMASK_ULL(1, 1))); + BUILD_BUG_ON(LORID_EL1_RES0 != (GENMASK_ULL(63, 24) | GENMASK_ULL(15, 8))); + BUILD_BUG_ON(ISR_EL1_RES0 != (GENMASK_ULL(63, 11) | GENMASK_ULL(5, 0))); + BUILD_BUG_ON(ICC_NMIAR1_EL1_RES0 != (GENMASK_ULL(63, 24))); + BUILD_BUG_ON(TRBLIMITR_EL1_RES0 != (GENMASK_ULL(11, 7))); + BUILD_BUG_ON(TRBBASER_EL1_RES0 != (GENMASK_ULL(11, 0))); + BUILD_BUG_ON(TRBSR_EL1_RES0 != (GENMASK_ULL(63, 56) | GENMASK_ULL(25, 24) | GENMASK_ULL(19, 19) | GENMASK_ULL(16, 16))); + BUILD_BUG_ON(TRBMAR_EL1_RES0 != (GENMASK_ULL(63, 12))); + BUILD_BUG_ON(TRBTRG_EL1_RES0 != (GENMASK_ULL(63, 32))); + BUILD_BUG_ON(TRBIDR_EL1_RES0 != (GENMASK_ULL(63, 12) | GENMASK_ULL(7, 6))); + +#endif +} diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 8725291cb00a..ce8886122ed3 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -23,7 +23,7 @@ static DEFINE_PER_CPU(u64, mdcr_el2); -/** +/* * save/restore_guest_debug_regs * * For some debug operations we need to tweak some guest registers. As @@ -143,6 +143,7 @@ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) /** * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state + * @vcpu: the vcpu pointer */ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 431fd429932d..4697ba41b3a9 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -427,12 +427,14 @@ static const complex_condition_check ccc[] = { * [19:14] bit number in the FGT register (6 bits) * [20] trap polarity (1 bit) * [25:21] FG filter (5 bits) - * [62:26] Unused (37 bits) + * [35:26] Main SysReg table index (10 bits) + * [62:36] Unused (27 bits) * [63] RES0 - Must be zero, as lost on insertion in the xarray */ #define TC_CGT_BITS 10 #define TC_FGT_BITS 4 #define TC_FGF_BITS 5 +#define TC_SRI_BITS 10 union trap_config { u64 val; @@ -442,7 +444,8 @@ union trap_config { unsigned long bit:6; /* Bit number */ unsigned long pol:1; /* Polarity */ unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ - unsigned long unused:37; /* Unused, should be zero */ + unsigned long sri:TC_SRI_BITS; /* SysReg Index */ + unsigned long unused:27; /* Unused, should be zero */ unsigned long mbz:1; /* Must Be Zero */ }; }; @@ -1006,18 +1009,6 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { static DEFINE_XARRAY(sr_forward_xa); -enum fgt_group_id { - __NO_FGT_GROUP__, - HFGxTR_GROUP, - HDFGRTR_GROUP, - HDFGWTR_GROUP, - HFGITR_GROUP, - HAFGRTR_GROUP, - - /* Must be last */ - __NR_FGT_GROUP_IDS__ -}; - enum fg_filter_id { __NO_FGF__, HCRX_FGTnXS, @@ -1757,6 +1748,28 @@ static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc, err); } +static u32 encoding_next(u32 encoding) +{ + u8 op0, op1, crn, crm, op2; + + op0 = sys_reg_Op0(encoding); + op1 = sys_reg_Op1(encoding); + crn = sys_reg_CRn(encoding); + crm = sys_reg_CRm(encoding); + op2 = sys_reg_Op2(encoding); + + if (op2 < Op2_mask) + return sys_reg(op0, op1, crn, crm, op2 + 1); + if (crm < CRm_mask) + return sys_reg(op0, op1, crn, crm + 1, 0); + if (crn < CRn_mask) + return sys_reg(op0, op1, crn + 1, 0, 0); + if (op1 < Op1_mask) + return sys_reg(op0, op1 + 1, 0, 0, 0); + + return sys_reg(op0 + 1, 0, 0, 0, 0); +} + int __init populate_nv_trap_config(void) { int ret = 0; @@ -1775,23 +1788,18 @@ int __init populate_nv_trap_config(void) ret = -EINVAL; } - if (cgt->encoding != cgt->end) { - prev = xa_store_range(&sr_forward_xa, - cgt->encoding, cgt->end, - xa_mk_value(cgt->tc.val), - GFP_KERNEL); - } else { - prev = xa_store(&sr_forward_xa, cgt->encoding, + for (u32 enc = cgt->encoding; enc <= cgt->end; enc = encoding_next(enc)) { + prev = xa_store(&sr_forward_xa, enc, xa_mk_value(cgt->tc.val), GFP_KERNEL); if (prev && !xa_is_err(prev)) { ret = -EINVAL; print_nv_trap_error(cgt, "Duplicate CGT", ret); } - } - if (xa_is_err(prev)) { - ret = xa_err(prev); - print_nv_trap_error(cgt, "Failed CGT insertion", ret); + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(cgt, "Failed CGT insertion", ret); + } } } @@ -1804,6 +1812,7 @@ int __init populate_nv_trap_config(void) for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; union trap_config tc; + void *prev; if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) { ret = -EINVAL; @@ -1818,8 +1827,13 @@ int __init populate_nv_trap_config(void) } tc.val |= fgt->tc.val; - xa_store(&sr_forward_xa, fgt->encoding, - xa_mk_value(tc.val), GFP_KERNEL); + prev = xa_store(&sr_forward_xa, fgt->encoding, + xa_mk_value(tc.val), GFP_KERNEL); + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(fgt, "Failed FGT insertion", ret); + } } kvm_info("nv: %ld fine grained trap handlers\n", @@ -1845,6 +1859,38 @@ check_mcb: return ret; } +int __init populate_sysreg_config(const struct sys_reg_desc *sr, + unsigned int idx) +{ + union trap_config tc; + u32 encoding; + void *ret; + + /* + * 0 is a valid value for the index, but not for the storage. + * We'll store (idx+1), so check against an offset'd limit. + */ + if (idx >= (BIT(TC_SRI_BITS) - 1)) { + kvm_err("sysreg %s (%d) out of range\n", sr->name, idx); + return -EINVAL; + } + + encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2); + tc = get_trap_config(encoding); + + if (tc.sri) { + kvm_err("sysreg %s (%d) duplicate entry (%d)\n", + sr->name, idx - 1, tc.sri); + return -EINVAL; + } + + tc.sri = idx + 1; + ret = xa_store(&sr_forward_xa, encoding, + xa_mk_value(tc.val), GFP_KERNEL); + + return xa_err(ret); +} + static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, const struct trap_bits *tb) { @@ -1892,20 +1938,64 @@ static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, return __compute_trap_behaviour(vcpu, tc.cgt, b); } -static bool check_fgt_bit(u64 val, const union trap_config tc) +static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) { - return ((val >> tc.bit) & 1) == tc.pol; + struct kvm_sysreg_masks *masks; + + /* Only handle the VNCR-backed regs for now */ + if (sr < __VNCR_START__) + return 0; + + masks = kvm->arch.sysreg_masks; + + return masks->mask[sr - __VNCR_START__].res0; } -#define sanitised_sys_reg(vcpu, reg) \ - ({ \ - u64 __val; \ - __val = __vcpu_sys_reg(vcpu, reg); \ - __val &= ~__ ## reg ## _RES0; \ - (__val); \ - }) +static bool check_fgt_bit(struct kvm *kvm, bool is_read, + u64 val, const union trap_config tc) +{ + enum vcpu_sysreg sr; + + if (tc.pol) + return (val & BIT(tc.bit)); + + /* + * FGTs with negative polarities are an absolute nightmare, as + * we need to evaluate the bit in the light of the feature + * that defines it. WTF were they thinking? + * + * So let's check if the bit has been earmarked as RES0, as + * this indicates an unimplemented feature. + */ + if (val & BIT(tc.bit)) + return false; + + switch ((enum fgt_group_id)tc.fgt) { + case HFGxTR_GROUP: + sr = is_read ? HFGRTR_EL2 : HFGWTR_EL2; + break; + + case HDFGRTR_GROUP: + sr = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; + break; + + case HAFGRTR_GROUP: + sr = HAFGRTR_EL2; + break; + + case HFGITR_GROUP: + sr = HFGITR_EL2; + break; + + default: + WARN_ONCE(1, "Unhandled FGT group"); + return false; + } + + return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); +} -bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) { union trap_config tc; enum trap_behaviour b; @@ -1913,9 +2003,6 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) u32 sysreg; u64 esr, val; - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) - return false; - esr = kvm_vcpu_get_esr(vcpu); sysreg = esr_sys64_to_sysreg(esr); is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; @@ -1926,13 +2013,27 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) * A value of 0 for the whole entry means that we know nothing * for this sysreg, and that it cannot be re-injected into the * nested hypervisor. In this situation, let's cut it short. - * - * Note that ultimately, we could also make use of the xarray - * to store the index of the sysreg in the local descriptor - * array, avoiding another search... Hint, hint... */ if (!tc.val) - return false; + goto local; + + /* + * If a sysreg can be trapped using a FGT, first check whether we + * trap for the purpose of forbidding the feature. In that case, + * inject an UNDEF. + */ + if (tc.fgt != __NO_FGT_GROUP__ && + (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { + kvm_inject_undefined(vcpu); + return true; + } + + /* + * If we're not nesting, immediately return to the caller, with the + * sysreg index, should we have it. + */ + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + goto local; switch ((enum fgt_group_id)tc.fgt) { case __NO_FGT_GROUP__: @@ -1940,25 +2041,24 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) case HFGxTR_GROUP: if (is_read) - val = sanitised_sys_reg(vcpu, HFGRTR_EL2); + val = __vcpu_sys_reg(vcpu, HFGRTR_EL2); else - val = sanitised_sys_reg(vcpu, HFGWTR_EL2); + val = __vcpu_sys_reg(vcpu, HFGWTR_EL2); break; case HDFGRTR_GROUP: - case HDFGWTR_GROUP: if (is_read) - val = sanitised_sys_reg(vcpu, HDFGRTR_EL2); + val = __vcpu_sys_reg(vcpu, HDFGRTR_EL2); else - val = sanitised_sys_reg(vcpu, HDFGWTR_EL2); + val = __vcpu_sys_reg(vcpu, HDFGWTR_EL2); break; case HAFGRTR_GROUP: - val = sanitised_sys_reg(vcpu, HAFGRTR_EL2); + val = __vcpu_sys_reg(vcpu, HAFGRTR_EL2); break; case HFGITR_GROUP: - val = sanitised_sys_reg(vcpu, HFGITR_EL2); + val = __vcpu_sys_reg(vcpu, HFGITR_EL2); switch (tc.fgf) { u64 tmp; @@ -1966,7 +2066,7 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) break; case HCRX_FGTnXS: - tmp = sanitised_sys_reg(vcpu, HCRX_EL2); + tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); if (tmp & HCRX_EL2_FGTnXS) tc.fgt = __NO_FGT_GROUP__; } @@ -1975,10 +2075,11 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) case __NR_FGT_GROUP_IDS__: /* Something is really wrong, bail out */ WARN_ONCE(1, "__NR_FGT_GROUP_IDS__"); - return false; + goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read, + val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); @@ -1987,6 +2088,26 @@ bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; +local: + if (!tc.sri) { + struct sys_reg_params params; + + params = esr_sys64_to_params(esr); + + /* + * Check for the IMPDEF range, as per DDI0487 J.a, + * D18.3.2 Reserved encodings for IMPLEMENTATION + * DEFINED registers. + */ + if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) + print_sys_reg_msg(¶ms, + "Unsupported guest access at: %lx\n", + *vcpu_pc(vcpu)); + kvm_inject_undefined(vcpu); + return true; + } + + *sr_index = tc.sri - 1; return false; inject: diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 8c1d0d4853df..571cf6eef1e1 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -117,7 +117,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) } /* - * Called just before entering the guest once we are no longer preemptable + * Called just before entering the guest once we are no longer preemptible * and interrupts are disabled. If we have managed to run anything using * FP while we were preemptible (such as off the back of an interrupt), * then neither the host nor the guest own the FP hardware (and it was the diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index aaf1d4939739..6e22e658795a 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -711,6 +711,7 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, /** * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG + * @vcpu: the vCPU pointer * * This is for all registers. */ @@ -729,6 +730,8 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) /** * kvm_arm_copy_reg_indices - get indices of all registers. + * @vcpu: the vCPU pointer + * @uindices: register list to copy * * We do core registers right here, then we append system regs. */ @@ -902,8 +905,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging - * @kvm: pointer to the KVM struct - * @kvm_guest_debug: the ioctl data buffer + * @vcpu: the vCPU pointer + * @dbg: the ioctl data buffer * * This sets up and enables the VM for guest debugging. Userspace * passes in a control flag to enable different debug types and diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c index f98cbe2626a1..8d9670e6615d 100644 --- a/arch/arm64/kvm/hyp/aarch32.c +++ b/arch/arm64/kvm/hyp/aarch32.c @@ -84,7 +84,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) } /** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in IT-block * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN @@ -120,7 +120,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) } /** - * kvm_skip_instr - skip a trapped instruction and proceed to the next + * kvm_skip_instr32 - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ void kvm_skip_instr32(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index a038320cdb08..e3fcf8c4d5b4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -79,14 +79,48 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) clr |= ~hfg & __ ## reg ## _nMASK; \ } while(0) -#define update_fgt_traps_cs(vcpu, reg, clr, set) \ +#define reg_to_fgt_group_id(reg) \ + ({ \ + enum fgt_group_id id; \ + switch(reg) { \ + case HFGRTR_EL2: \ + case HFGWTR_EL2: \ + id = HFGxTR_GROUP; \ + break; \ + case HFGITR_EL2: \ + id = HFGITR_GROUP; \ + break; \ + case HDFGRTR_EL2: \ + case HDFGWTR_EL2: \ + id = HDFGRTR_GROUP; \ + break; \ + case HAFGRTR_EL2: \ + id = HAFGRTR_GROUP; \ + break; \ + default: \ + BUILD_BUG_ON(1); \ + } \ + \ + id; \ + }) + +#define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ + do { \ + u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ + set |= hfg & __ ## reg ## _MASK; \ + clr |= hfg & __ ## reg ## _nMASK; \ + } while(0) + +#define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ do { \ - struct kvm_cpu_context *hctxt = \ - &this_cpu_ptr(&kvm_host_data)->host_ctxt; \ u64 c = 0, s = 0; \ \ ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ - compute_clr_set(vcpu, reg, c, s); \ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ + compute_clr_set(vcpu, reg, c, s); \ + \ + compute_undef_clr_set(vcpu, kvm, reg, c, s); \ + \ s |= set; \ c |= clr; \ if (c || s) { \ @@ -97,8 +131,8 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } \ } while(0) -#define update_fgt_traps(vcpu, reg) \ - update_fgt_traps_cs(vcpu, reg, 0, 0) +#define update_fgt_traps(hctxt, vcpu, kvm, reg) \ + update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) /* * Validate the fine grain trap masks. @@ -122,8 +156,7 @@ static inline bool cpu_has_amu(void) static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; - u64 r_val, w_val; + struct kvm *kvm = kern_hyp_va(vcpu->kvm); CHECK_FGT_MASKS(HFGRTR_EL2); CHECK_FGT_MASKS(HFGWTR_EL2); @@ -136,72 +169,45 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - ctxt_sys_reg(hctxt, HFGRTR_EL2) = read_sysreg_s(SYS_HFGRTR_EL2); - ctxt_sys_reg(hctxt, HFGWTR_EL2) = read_sysreg_s(SYS_HFGWTR_EL2); - - if (cpus_have_final_cap(ARM64_SME)) { - tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; - - r_clr |= tmp; - w_clr |= tmp; - } - - /* - * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD. - */ - if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) - w_set |= HFGxTR_EL2_TCR_EL1_MASK; - - if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { - compute_clr_set(vcpu, HFGRTR_EL2, r_clr, r_set); - compute_clr_set(vcpu, HFGWTR_EL2, w_clr, w_set); - } - - /* The default to trap everything not handled or supported in KVM. */ - tmp = HFGxTR_EL2_nAMAIR2_EL1 | HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nACCDATA_EL1; - - r_val = __HFGRTR_EL2_nMASK & ~tmp; - r_val |= r_set; - r_val &= ~r_clr; - - w_val = __HFGWTR_EL2_nMASK & ~tmp; - w_val |= w_set; - w_val &= ~w_clr; - - write_sysreg_s(r_val, SYS_HFGRTR_EL2); - write_sysreg_s(w_val, SYS_HFGWTR_EL2); - - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) - return; - - update_fgt_traps(vcpu, HFGITR_EL2); - update_fgt_traps(vcpu, HDFGRTR_EL2); - update_fgt_traps(vcpu, HDFGWTR_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); + update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, + cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? + HFGxTR_EL2_TCR_EL1_MASK : 0); + update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); if (cpu_has_amu()) - update_fgt_traps(vcpu, HAFGRTR_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); } +#define __deactivate_fgt(htcxt, vcpu, kvm, reg) \ + do { \ + if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \ + kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ + } while(0) + static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - write_sysreg_s(ctxt_sys_reg(hctxt, HFGRTR_EL2), SYS_HFGRTR_EL2); - write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); - - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) - return; - - write_sysreg_s(ctxt_sys_reg(hctxt, HFGITR_EL2), SYS_HFGITR_EL2); - write_sysreg_s(ctxt_sys_reg(hctxt, HDFGRTR_EL2), SYS_HDFGRTR_EL2); - write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2); + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); + else + __deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2); if (cpu_has_amu()) - write_sysreg_s(ctxt_sys_reg(hctxt, HAFGRTR_EL2), SYS_HAFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); } static inline void __activate_traps_common(struct kvm_vcpu *vcpu) @@ -230,7 +236,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); if (cpus_have_final_cap(ARM64_HAS_HCX)) { - u64 hcrx = HCRX_GUEST_FLAGS; + u64 hcrx = vcpu->arch.hcrx_el2; if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 clr = 0, set = 0; diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index bb6b571ec627..4be6a7fa0070 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -27,16 +27,34 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); } -static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; if (!vcpu) vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + return vcpu; +} + +static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_mte(kern_hyp_va(vcpu->kvm)); } +static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_S1PIE)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -55,7 +73,7 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR); ctxt_sys_reg(ctxt, AMAIR_EL1) = read_sysreg_el1(SYS_AMAIR); ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL); - if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + if (ctxt_has_s1pie(ctxt)) { ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); } @@ -131,7 +149,7 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR); write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1), SYS_AMAIR); write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL); - if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + if (ctxt_has_s1pie(ctxt)) { write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); } diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 4558c02eb352..7746ea507b6f 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -31,8 +31,8 @@ static void __debug_save_spe(u64 *pmscr_el1) return; /* Yes; save the control register and disable data generation */ - *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1); - write_sysreg_s(0, SYS_PMSCR_EL1); + *pmscr_el1 = read_sysreg_el1(SYS_PMSCR); + write_sysreg_el1(0, SYS_PMSCR); isb(); /* Now drain all buffered data to memory */ @@ -48,7 +48,7 @@ static void __debug_restore_spe(u64 pmscr_el1) isb(); /* Re-enable data generation */ - write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); + write_sysreg_el1(pmscr_el1, SYS_PMSCR); } static void __debug_save_trace(u64 *trfcr_el1) @@ -63,8 +63,8 @@ static void __debug_save_trace(u64 *trfcr_el1) * Since access to TRFCR_EL1 is trapped, the guest can't * modify the filtering set by the host. */ - *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1); - write_sysreg_s(0, SYS_TRFCR_EL1); + *trfcr_el1 = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(0, SYS_TRFCR); isb(); /* Drain the trace buffer to memory */ tsb_csync(); @@ -76,7 +76,7 @@ static void __debug_restore_trace(u64 trfcr_el1) return; /* Restore trace filter controls */ - write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1); + write_sysreg_el1(trfcr_el1, SYS_TRFCR); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 7693a6757cd7..135cfb294ee5 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -110,7 +110,7 @@ SYM_FUNC_END(__host_enter) * u64 elr, u64 par); */ SYM_FUNC_START(__hyp_do_panic) - /* Prepare and exit to the host's panic funciton. */ + /* Prepare and exit to the host's panic function. */ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index b01a3d1078a8..8850b591d775 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -155,7 +155,7 @@ int hyp_back_vmemmap(phys_addr_t back) start = hyp_memory[i].base; start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); /* - * The begining of the hyp_vmemmap region for the current + * The beginning of the hyp_vmemmap region for the current * memblock may already be backed by the page backing the end * the previous region, so avoid mapping it twice. */ @@ -408,7 +408,7 @@ static void *admit_host_page(void *arg) return pop_hyp_memcache(host_mc, hyp_phys_to_virt); } -/* Refill our local memcache by poping pages from the one provided by the host. */ +/* Refill our local memcache by popping pages from the one provided by the host. */ int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc) { diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index ab9d05fcf98b..3fae5830f8d2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -717,15 +717,29 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); + kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + switch (prot & (KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_NORMAL_NC)) { + case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: + return -EINVAL; + case KVM_PGTABLE_PROT_DEVICE: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + break; + case KVM_PGTABLE_PROT_NORMAL_NC: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + break; + default: + attr = KVM_S2_MEMATTR(pgt, NORMAL); + } + if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - else if (device) - return -EINVAL; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 8e1e0d5033b6..a8b9ea496706 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -95,7 +95,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) } /** - * __vcpu_put_switch_syregs - Restore host system registers to the physical CPU + * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU * * @vcpu: The VCPU pointer * diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 0bd93a5f21ce..a640e839848e 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -134,7 +134,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) if (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE) { fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE; } else { - /* no need to shuffle FS[4] into DFSR[10] as its 0 */ + /* no need to shuffle FS[4] into DFSR[10] as it's 0 */ fsr = DFSR_FSC_EXTABT_nLPAE; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d14504821b79..2902f3ffca3c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -305,7 +305,7 @@ static void invalidate_icache_guest_page(void *va, size_t size) * does. */ /** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap @@ -1381,7 +1381,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed; - bool device = false; + bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1472,6 +1472,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn = fault_ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; @@ -1557,10 +1559,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) - prot |= KVM_PGTABLE_PROT_DEVICE; - else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) + if (device) { + if (vfio_allow_any_uc) + prot |= KVM_PGTABLE_PROT_NORMAL_NC; + else + prot |= KVM_PGTABLE_PROT_DEVICE; + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; + } /* * Under the premise of getting a FSC_PERM fault, we just need to relax diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index ba95d044bc98..ced30c90521a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -133,6 +133,13 @@ static u64 limit_nv_id_reg(u32 id, u64 val) val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001); break; + case SYS_ID_AA64MMFR4_EL1: + val = 0; + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + val |= FIELD_PREP(NV_FTR(MMFR4, E2H0), + ID_AA64MMFR4_EL1_E2H0_NI_NV1); + break; + case SYS_ID_AA64DFR0_EL1: /* Only limited support for PMU, Debug, BPs and WPs */ val &= (NV_FTR(DFR0, PMUVer) | @@ -156,15 +163,280 @@ static u64 limit_nv_id_reg(u32 id, u64 val) return val; } + +u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) +{ + u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr); + struct kvm_sysreg_masks *masks; + + masks = vcpu->kvm->arch.sysreg_masks; + + if (masks) { + sr -= __VNCR_START__; + + v &= ~masks->mask[sr].res0; + v |= masks->mask[sr].res1; + } + + return v; +} + +static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) +{ + int i = sr - __VNCR_START__; + + kvm->arch.sysreg_masks->mask[i].res0 = res0; + kvm->arch.sysreg_masks->mask[i].res1 = res1; +} + int kvm_init_nv_sysregs(struct kvm *kvm) { + u64 res0, res1; + int ret = 0; + mutex_lock(&kvm->arch.config_lock); + if (kvm->arch.sysreg_masks) + goto out; + + kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), + GFP_KERNEL); + if (!kvm->arch.sysreg_masks) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < KVM_ARM_ID_REG_NUM; i++) kvm->arch.id_regs[i] = limit_nv_id_reg(IDX_IDREG(i), kvm->arch.id_regs[i]); + /* VTTBR_EL2 */ + res0 = res1 = 0; + if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16)) + res0 |= GENMASK(63, 56); + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP)) + res0 |= VTTBR_CNP_BIT; + set_sysreg_masks(kvm, VTTBR_EL2, res0, res1); + + /* VTCR_EL2 */ + res0 = GENMASK(63, 32) | GENMASK(30, 20); + res1 = BIT(31); + set_sysreg_masks(kvm, VTCR_EL2, res0, res1); + + /* VMPIDR_EL2 */ + res0 = GENMASK(63, 40) | GENMASK(30, 24); + res1 = BIT(31); + set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1); + + /* HCR_EL2 */ + res0 = BIT(48); + res1 = HCR_RW; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, TWED, IMP)) + res0 |= GENMASK(63, 59); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, MTE2)) + res0 |= (HCR_TID5 | HCR_DCT | HCR_ATA); + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, TTLBxS)) + res0 |= (HCR_TTLBIS | HCR_TTLBOS); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) && + !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2)) + res0 |= HCR_ENSCXT; + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, IMP)) + res0 |= (HCR_TOCU | HCR_TICAB | HCR_TID4); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) + res0 |= HCR_AMVOFFEN; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1)) + res0 |= HCR_FIEN; + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP)) + res0 |= HCR_FWB; + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)) + res0 |= HCR_NV2; + if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) + res0 |= (HCR_AT | HCR_NV1 | HCR_NV); + if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + res0 |= (HCR_API | HCR_APK); + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP)) + res0 |= BIT(39); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) + res0 |= (HCR_TEA | HCR_TERR); + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + res0 |= HCR_TLOR; + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP)) + res1 |= HCR_E2H; + set_sysreg_masks(kvm, HCR_EL2, res0, res1); + + /* HCRX_EL2 */ + res0 = HCRX_EL2_RES0; + res1 = HCRX_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64ISAR3_EL1, PACM, TRIVIAL_IMP)) + res0 |= HCRX_EL2_PACMEn; + if (!kvm_has_feat(kvm, ID_AA64PFR2_EL1, FPMR, IMP)) + res0 |= HCRX_EL2_EnFPM; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) + res0 |= HCRX_EL2_GCSEn; + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, SYSREG_128, IMP)) + res0 |= HCRX_EL2_EnIDCP128; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, DEV_ASYNC)) + res0 |= (HCRX_EL2_EnSDERR | HCRX_EL2_EnSNERR); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, DF2, IMP)) + res0 |= HCRX_EL2_TMEA; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP)) + res0 |= HCRX_EL2_D128En; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) + res0 |= HCRX_EL2_PTTWI; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP)) + res0 |= HCRX_EL2_SCTLR2En; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + res0 |= HCRX_EL2_TCR2En; + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) + res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, CMOW, IMP)) + res0 |= HCRX_EL2_CMOW; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, NMI, IMP)) + res0 |= (HCRX_EL2_VFNMI | HCRX_EL2_VINMI | HCRX_EL2_TALLINT); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP) || + !(read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS)) + res0 |= HCRX_EL2_SMPME; + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) + res0 |= (HCRX_EL2_FGTnXS | HCRX_EL2_FnXS); + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V)) + res0 |= HCRX_EL2_EnASR; + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64)) + res0 |= HCRX_EL2_EnALS; + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) + res0 |= HCRX_EL2_EnAS0; + set_sysreg_masks(kvm, HCRX_EL2, res0, res1); + + /* HFG[RW]TR_EL2 */ + res0 = res1 = 0; + if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey | + HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey | + HFGxTR_EL2_APIBKey); + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + res0 |= (HFGxTR_EL2_LORC_EL1 | HFGxTR_EL2_LOREA_EL1 | + HFGxTR_EL2_LORID_EL1 | HFGxTR_EL2_LORN_EL1 | + HFGxTR_EL2_LORSA_EL1); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) && + !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2)) + res0 |= (HFGxTR_EL2_SCXTNUM_EL1 | HFGxTR_EL2_SCXTNUM_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP)) + res0 |= HFGxTR_EL2_ICC_IGRPENn_EL1; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) + res0 |= (HFGxTR_EL2_ERRIDR_EL1 | HFGxTR_EL2_ERRSELR_EL1 | + HFGxTR_EL2_ERXFR_EL1 | HFGxTR_EL2_ERXCTLR_EL1 | + HFGxTR_EL2_ERXSTATUS_EL1 | HFGxTR_EL2_ERXMISCn_EL1 | + HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 | + HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1); + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) + res0 |= HFGxTR_EL2_nACCDATA_EL1; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) + res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) + res0 |= HFGxTR_EL2_nRCWMASK_EL1; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + res0 |= HFGxTR_EL2_nS2POR_EL1; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) + res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); + set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1); + set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1); + + /* HDFG[RW]TR_EL2 */ + res0 = res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + res0 |= HDFGRTR_EL2_OSDLR_EL1; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (HDFGRTR_EL2_PMEVCNTRn_EL0 | HDFGRTR_EL2_PMEVTYPERn_EL0 | + HDFGRTR_EL2_PMCCFILTR_EL0 | HDFGRTR_EL2_PMCCNTR_EL0 | + HDFGRTR_EL2_PMCNTEN | HDFGRTR_EL2_PMINTEN | + HDFGRTR_EL2_PMOVS | HDFGRTR_EL2_PMSELR_EL0 | + HDFGRTR_EL2_PMMIR_EL1 | HDFGRTR_EL2_PMUSERENR_EL0 | + HDFGRTR_EL2_PMCEIDn_EL0); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) + res0 |= (HDFGRTR_EL2_PMBLIMITR_EL1 | HDFGRTR_EL2_PMBPTR_EL1 | + HDFGRTR_EL2_PMBSR_EL1 | HDFGRTR_EL2_PMSCR_EL1 | + HDFGRTR_EL2_PMSEVFR_EL1 | HDFGRTR_EL2_PMSFCR_EL1 | + HDFGRTR_EL2_PMSICR_EL1 | HDFGRTR_EL2_PMSIDR_EL1 | + HDFGRTR_EL2_PMSIRR_EL1 | HDFGRTR_EL2_PMSLATFR_EL1 | + HDFGRTR_EL2_PMBIDR_EL1); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) + res0 |= (HDFGRTR_EL2_TRC | HDFGRTR_EL2_TRCAUTHSTATUS | + HDFGRTR_EL2_TRCAUXCTLR | HDFGRTR_EL2_TRCCLAIM | + HDFGRTR_EL2_TRCCNTVRn | HDFGRTR_EL2_TRCID | + HDFGRTR_EL2_TRCIMSPECn | HDFGRTR_EL2_TRCOSLSR | + HDFGRTR_EL2_TRCPRGCTLR | HDFGRTR_EL2_TRCSEQSTR | + HDFGRTR_EL2_TRCSSCSRn | HDFGRTR_EL2_TRCSTATR | + HDFGRTR_EL2_TRCVICTLR); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + res0 |= (HDFGRTR_EL2_TRBBASER_EL1 | HDFGRTR_EL2_TRBIDR_EL1 | + HDFGRTR_EL2_TRBLIMITR_EL1 | HDFGRTR_EL2_TRBMAR_EL1 | + HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 | + HDFGRTR_EL2_TRBTRG_EL1); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) + res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBDATA); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) + res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1; + set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1); + + /* Reuse the bits from the read-side and add the write-specific stuff */ + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (HDFGWTR_EL2_PMCR_EL0 | HDFGWTR_EL2_PMSWINC_EL0); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) + res0 |= HDFGWTR_EL2_TRCOSLAR; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + res0 |= HDFGWTR_EL2_TRFCR_EL1; + set_sysreg_masks(kvm, HFGWTR_EL2, res0 | HDFGWTR_EL2_RES0, res1); + + /* HFGITR_EL2 */ + res0 = HFGITR_EL2_RES0; + res1 = HFGITR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, DPB, DPB2)) + res0 |= HFGITR_EL2_DCCVADP; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + res0 |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP); + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + res0 |= (HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS | + HFGITR_EL2_TLBIVAALE1OS | HFGITR_EL2_TLBIVALE1OS | + HFGITR_EL2_TLBIVAAE1OS | HFGITR_EL2_TLBIASIDE1OS | + HFGITR_EL2_TLBIVAE1OS | HFGITR_EL2_TLBIVMALLE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + res0 |= (HFGITR_EL2_TLBIRVAALE1 | HFGITR_EL2_TLBIRVALE1 | + HFGITR_EL2_TLBIRVAAE1 | HFGITR_EL2_TLBIRVAE1 | + HFGITR_EL2_TLBIRVAALE1IS | HFGITR_EL2_TLBIRVALE1IS | + HFGITR_EL2_TLBIRVAAE1IS | HFGITR_EL2_TLBIRVAE1IS | + HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, IMP)) + res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX | + HFGITR_EL2_CPPRCTX); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) + res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) + res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | + HFGITR_EL2_nGCSEPP); + if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX)) + res0 |= HFGITR_EL2_COSPRCTX; + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + res0 |= HFGITR_EL2_ATS1E1A; + set_sysreg_masks(kvm, HFGITR_EL2, res0, res1); + + /* HAFGRTR_EL2 - not a lot to see here */ + res0 = HAFGRTR_EL2_RES0; + res1 = HAFGRTR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) + res0 |= ~(res0 | res1); + set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); +out: mutex_unlock(&kvm->arch.config_lock); - return 0; + return ret; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 3d9467ff73bc..a35ce10e0a9f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -64,12 +64,11 @@ u64 kvm_pmu_evtyper_mask(struct kvm *kvm) { u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | kvm_pmu_event_mask(kvm); - u64 pfr0 = IDREG(kvm, SYS_ID_AA64PFR0_EL1); - if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL2, pfr0)) + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) mask |= ARMV8_PMU_INCLUDE_EL2; - if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL3, pfr0)) + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | ARMV8_PMU_EXCLUDE_NS_EL1 | ARMV8_PMU_EXCLUDE_EL3; @@ -83,8 +82,10 @@ u64 kvm_pmu_evtyper_mask(struct kvm *kvm) */ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + return (pmc->idx == ARMV8_PMU_CYCLE_IDX || - kvm_pmu_is_3p5(kvm_pmc_to_vcpu(pmc))); + kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)); } static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) @@ -419,7 +420,7 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) kvm_pmu_update_state(vcpu); } -/** +/* * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding * to the event. * This is why we need a callback to do it once outside of the NMI context. @@ -490,7 +491,7 @@ static u64 compute_period(struct kvm_pmc *pmc, u64 counter) return val; } -/** +/* * When the perf event overflows, set the overflow status and inform the vcpu. */ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, @@ -556,7 +557,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) return; /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ - if (!kvm_pmu_is_3p5(vcpu)) + if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; /* The reset bits don't indicate any state, and shouldn't be saved. */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 30253bd19917..8e60aa4a8dfb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -12,6 +12,7 @@ #include <linux/bitfield.h> #include <linux/bsearch.h> #include <linux/cacheinfo.h> +#include <linux/debugfs.h> #include <linux/kvm_host.h> #include <linux/mm.h> #include <linux/printk.h> @@ -31,6 +32,7 @@ #include <trace/events/kvm.h> +#include "check-res-bits.h" #include "sys_regs.h" #include "trace.h" @@ -505,10 +507,9 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 val = IDREG(vcpu->kvm, SYS_ID_AA64MMFR1_EL1); u32 sr = reg_to_encoding(r); - if (!(val & (0xfUL << ID_AA64MMFR1_EL1_LO_SHIFT))) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { kvm_inject_undefined(vcpu); return false; } @@ -1685,7 +1686,8 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ (val) &= ~reg##_##field##_MASK; \ (val) |= FIELD_PREP(reg##_##field##_MASK, \ - min(__f_val, (u64)reg##_##field##_##limit)); \ + min(__f_val, \ + (u64)SYS_FIELD_VALUE(reg, field, limit))); \ (val); \ }) @@ -2174,6 +2176,16 @@ static bool access_spsr(struct kvm_vcpu *vcpu, return true; } +static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 val = r->val; + + if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) + val |= HCR_E2H; + + return __vcpu_sys_reg(vcpu, r->reg) = val; +} + /* * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2186,16 +2198,6 @@ static bool access_spsr(struct kvm_vcpu *vcpu, * guest... */ static const struct sys_reg_desc sys_reg_descs[] = { - { SYS_DESC(SYS_DC_ISW), access_dcsw }, - { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, - { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, - { SYS_DESC(SYS_DC_CSW), access_dcsw }, - { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, - { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, - { SYS_DESC(SYS_DC_CISW), access_dcsw }, - { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, - { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, - DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, @@ -2349,7 +2351,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), ID_SANITISED(ID_AA64MMFR3_EL1), - ID_UNALLOCATED(7,4), + ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), ID_UNALLOCATED(7,6), ID_UNALLOCATED(7,7), @@ -2665,7 +2667,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0), EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), - EL2_REG_VNCR(HCR_EL2, reset_val, 0), + EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), EL2_REG(MDCR_EL2, access_rw, reset_val, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), @@ -2727,6 +2729,18 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static struct sys_reg_desc sys_insn_descs[] = { + { SYS_DESC(SYS_DC_ISW), access_dcsw }, + { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CSW), access_dcsw }, + { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CISW), access_dcsw }, + { SYS_DESC(SYS_DC_CIGSW), access_dcgsw }, + { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw }, +}; + static const struct sys_reg_desc *first_idreg; static bool trap_dbgdidr(struct kvm_vcpu *vcpu, @@ -2737,8 +2751,7 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, return ignore_write(vcpu, p); } else { u64 dfr = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); - u64 pfr = IDREG(vcpu->kvm, SYS_ID_AA64PFR0_EL1); - u32 el3 = !!SYS_FIELD_GET(ID_AA64PFR0_EL1, EL3, pfr); + u32 el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP); p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) | (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) | @@ -3159,7 +3172,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, /** * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, const struct sys_reg_desc *global, @@ -3326,7 +3340,9 @@ static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu, /** * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access * @vcpu: The VCPU pointer - * @run: The kvm_run struct + * @params: &struct sys_reg_params + * @global: &struct sys_reg_desc + * @nr_global: size of the @global array */ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, struct sys_reg_params *params, @@ -3384,12 +3400,6 @@ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) return kvm_handle_cp_32(vcpu, ¶ms, cp14_regs, ARRAY_SIZE(cp14_regs)); } -static bool is_imp_def_sys_reg(struct sys_reg_params *params) -{ - // See ARM DDI 0487E.a, section D12.3.2 - return params->Op0 == 3 && (params->CRn & 0b1011) == 0b1011; -} - /** * emulate_sys_reg - Emulate a guest access to an AArch64 system register * @vcpu: The VCPU pointer @@ -3398,26 +3408,106 @@ static bool is_imp_def_sys_reg(struct sys_reg_params *params) * Return: true if the system register access was successful, false otherwise. */ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *params) + struct sys_reg_params *params) { const struct sys_reg_desc *r; r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); - if (likely(r)) { perform_access(vcpu, params, r); return true; } - if (is_imp_def_sys_reg(params)) { - kvm_inject_undefined(vcpu); + print_sys_reg_msg(params, + "Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); + kvm_inject_undefined(vcpu); + + return false; +} + +static void *idregs_debug_start(struct seq_file *s, loff_t *pos) +{ + struct kvm *kvm = s->private; + u8 *iter; + + mutex_lock(&kvm->arch.config_lock); + + iter = &kvm->arch.idreg_debugfs_iter; + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) && + *iter == (u8)~0) { + *iter = *pos; + if (*iter >= KVM_ARM_ID_REG_NUM) + iter = NULL; } else { - print_sys_reg_msg(params, - "Unsupported guest sys_reg access at: %lx [%08lx]\n", - *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); - kvm_inject_undefined(vcpu); + iter = ERR_PTR(-EBUSY); } - return false; + + mutex_unlock(&kvm->arch.config_lock); + + return iter; +} + +static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kvm *kvm = s->private; + + (*pos)++; + + if ((kvm->arch.idreg_debugfs_iter + 1) < KVM_ARM_ID_REG_NUM) { + kvm->arch.idreg_debugfs_iter++; + + return &kvm->arch.idreg_debugfs_iter; + } + + return NULL; +} + +static void idregs_debug_stop(struct seq_file *s, void *v) +{ + struct kvm *kvm = s->private; + + if (IS_ERR(v)) + return; + + mutex_lock(&kvm->arch.config_lock); + + kvm->arch.idreg_debugfs_iter = ~0; + + mutex_unlock(&kvm->arch.config_lock); +} + +static int idregs_debug_show(struct seq_file *s, void *v) +{ + struct kvm *kvm = s->private; + const struct sys_reg_desc *desc; + + desc = first_idreg + kvm->arch.idreg_debugfs_iter; + + if (!desc->name) + return 0; + + seq_printf(s, "%20s:\t%016llx\n", + desc->name, IDREG(kvm, IDX_IDREG(kvm->arch.idreg_debugfs_iter))); + + return 0; +} + +static const struct seq_operations idregs_debug_sops = { + .start = idregs_debug_start, + .next = idregs_debug_next, + .stop = idregs_debug_stop, + .show = idregs_debug_show, +}; + +DEFINE_SEQ_ATTRIBUTE(idregs_debug); + +void kvm_sys_regs_create_debugfs(struct kvm *kvm) +{ + kvm->arch.idreg_debugfs_iter = ~0; + + debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm, + &idregs_debug_fops); } static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) @@ -3467,28 +3557,39 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) } /** - * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access + * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction + * trap on a guest execution * @vcpu: The VCPU pointer */ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { + const struct sys_reg_desc *desc = NULL; struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_esr(vcpu); int Rt = kvm_vcpu_sys_get_rt(vcpu); + int sr_idx; trace_kvm_handle_sys_reg(esr); - if (__check_nv_sr_forward(vcpu)) + if (triage_sysreg_trap(vcpu, &sr_idx)) return 1; params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - if (!emulate_sys_reg(vcpu, ¶ms)) - return 1; + /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */ + if (params.Op0 == 2 || params.Op0 == 3) + desc = &sys_reg_descs[sr_idx]; + else + desc = &sys_insn_descs[sr_idx]; - if (!params.is_write) + perform_access(vcpu, ¶ms, desc); + + /* Read from system register? */ + if (!params.is_write && + (params.Op0 == 2 || params.Op0 == 3)) vcpu_set_reg(vcpu, Rt, params.regval); + return 1; } @@ -3930,11 +4031,86 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range * return 0; } +void kvm_init_sysreg(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + + /* + * In the absence of FGT, we cannot independently trap TLBI + * Range instructions. This isn't great, but trapping all + * TLBIs would be far worse. Live with it... + */ + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + vcpu->arch.hcr_el2 |= HCR_TTLBOS; + + if (cpus_have_final_cap(ARM64_HAS_HCX)) { + vcpu->arch.hcrx_el2 = HCRX_GUEST_FLAGS; + + if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) + vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); + } + + if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) + goto out; + + kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | + HFGxTR_EL2_nMAIR2_EL1 | + HFGxTR_EL2_nS2POR_EL1 | + HFGxTR_EL2_nPOR_EL1 | + HFGxTR_EL2_nPOR_EL0 | + HFGxTR_EL2_nACCDATA_EL1 | + HFGxTR_EL2_nSMPRI_EL1_MASK | + HFGxTR_EL2_nTPIDR2_EL0_MASK); + + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1OS| + HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | + HFGITR_EL2_TLBIRVAE1OS | + HFGITR_EL2_TLBIVAALE1OS | + HFGITR_EL2_TLBIVALE1OS | + HFGITR_EL2_TLBIVAAE1OS | + HFGITR_EL2_TLBIASIDE1OS | + HFGITR_EL2_TLBIVAE1OS | + HFGITR_EL2_TLBIVMALLE1OS); + + if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1 | + HFGITR_EL2_TLBIRVALE1 | + HFGITR_EL2_TLBIRVAAE1 | + HFGITR_EL2_TLBIRVAE1 | + HFGITR_EL2_TLBIRVAALE1IS| + HFGITR_EL2_TLBIRVALE1IS | + HFGITR_EL2_TLBIRVAAE1IS | + HFGITR_EL2_TLBIRVAE1IS | + HFGITR_EL2_TLBIRVAALE1OS| + HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | + HFGITR_EL2_TLBIRVAE1OS); + + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | + HFGxTR_EL2_nPIR_EL1); + + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | + HAFGRTR_EL2_RES1); + + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); +out: + mutex_unlock(&kvm->arch.config_lock); +} + int __init kvm_sys_reg_table_init(void) { struct sys_reg_params params; bool valid = true; unsigned int i; + int ret = 0; + + check_res_bits(); /* Make sure tables are unique and in order. */ valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false); @@ -3943,6 +4119,7 @@ int __init kvm_sys_reg_table_init(void) valid &= check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true); valid &= check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true); valid &= check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false); + valid &= check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false); if (!valid) return -EINVAL; @@ -3957,8 +4134,13 @@ int __init kvm_sys_reg_table_init(void) if (!first_idreg) return -EINVAL; - if (kvm_get_mode() == KVM_MODE_NV) - return populate_nv_trap_config(); + ret = populate_nv_trap_config(); - return 0; + for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) + ret = populate_sysreg_config(sys_reg_descs + i, i); + + for (i = 0; !ret && i < ARRAY_SIZE(sys_insn_descs); i++) + ret = populate_sysreg_config(sys_insn_descs + i, i); + + return ret; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index c65c129b3500..997eea21ba2a 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -233,6 +233,8 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); +bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 85606a531dc3..389025ce7749 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -149,7 +149,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist) seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2"); seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis); if (v3) - seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count); + seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count)); seq_printf(s, "enabled:\t%d\n", dist->enabled); seq_printf(s, "\n"); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index e949e1d0fd9f..f20941f83a07 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,9 +53,9 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - INIT_LIST_HEAD(&dist->lpi_list_head); INIT_LIST_HEAD(&dist->lpi_translation_cache); raw_spin_lock_init(&dist->lpi_list_lock); + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); } /* CREATION */ @@ -309,7 +309,7 @@ int vgic_init(struct kvm *kvm) vgic_lpi_translation_cache_init(kvm); /* - * If we have GICv4.1 enabled, unconditionnaly request enable the + * If we have GICv4.1 enabled, unconditionally request enable the * v4 support so that we get HW-accelerated vSGIs. Otherwise, only * enable it if we present a virtual ITS to the guest. */ @@ -366,6 +366,8 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) if (vgic_supports_direct_msis(kvm)) vgic_v4_teardown(kvm); + + xa_destroy(&dist->lpi_xa); } static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -445,13 +447,15 @@ int vgic_lazy_init(struct kvm *kvm) /* RESOURCE MAPPING */ /** + * kvm_vgic_map_resources - map the MMIO regions + * @kvm: kvm struct pointer + * * Map the MMIO regions depending on the VGIC model exposed to the guest * called on the first VCPU run. * Also map the virtual CPU interface into the VM. * v2 calls vgic_init() if not already done. * v3 and derivatives return an error if the VGIC is not initialized. * vgic_ready() returns true if this function has succeeded. - * @kvm: kvm struct pointer */ int kvm_vgic_map_resources(struct kvm *kvm) { diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 28a93074eca1..e85a495ada9c 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -52,7 +52,12 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&irq->lpi_list); + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + if (ret) { + kfree(irq); + return ERR_PTR(ret); + } + INIT_LIST_HEAD(&irq->ap_list); raw_spin_lock_init(&irq->irq_lock); @@ -68,30 +73,30 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, * There could be a race with another vgic_add_lpi(), so we need to * check that we don't add a second list entry with the same LPI. */ - list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { - if (oldirq->intid != intid) - continue; - + oldirq = xa_load(&dist->lpi_xa, intid); + if (vgic_try_get_irq_kref(oldirq)) { /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() on the returned pointer once it's - * finished with the IRQ. - */ - vgic_get_irq_kref(irq); + goto out_unlock; + } + ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0)); + if (ret) { + xa_release(&dist->lpi_xa, intid); + kfree(irq); goto out_unlock; } - list_add_tail(&irq->lpi_list, &dist->lpi_list_head); - dist->lpi_list_count++; + atomic_inc(&dist->lpi_count); out_unlock: raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + if (ret) + return ERR_PTR(ret); + /* * We "cache" the configuration table entries in our struct vgic_irq's. * However we only have those structs for mapped IRQs, so we read in @@ -158,7 +163,7 @@ struct vgic_translation_cache_entry { * @cte_esz: collection table entry size * @dte_esz: device table entry size * @ite_esz: interrupt translation table entry size - * @save tables: save the ITS tables into guest RAM + * @save_tables: save the ITS tables into guest RAM * @restore_tables: restore the ITS internal structs from tables * stored in guest RAM * @commit: initialize the registers which expose the ABI settings, @@ -311,6 +316,8 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, return 0; } +#define GIC_LPI_MAX_INTID ((1 << INTERRUPT_ID_BITS_ITS) - 1) + /* * Create a snapshot of the current LPIs targeting @vcpu, so that we can * enumerate those LPIs without holding any lock. @@ -319,6 +326,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) { struct vgic_dist *dist = &kvm->arch.vgic; + XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET); struct vgic_irq *irq; unsigned long flags; u32 *intids; @@ -331,13 +339,15 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) * command). If coming from another path (such as enabling LPIs), * we must be careful not to overrun the array. */ - irq_count = READ_ONCE(dist->lpi_list_count); + irq_count = atomic_read(&dist->lpi_count); intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); if (!intids) return -ENOMEM; raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + rcu_read_lock(); + + xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) { if (i == irq_count) break; /* We don't need to "get" the IRQ, as we hold the list lock. */ @@ -345,6 +355,8 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) continue; intids[i++] = irq->intid; } + + rcu_read_unlock(); raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); *intid_ptr = intids; @@ -595,8 +607,8 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); irq = __vgic_its_check_cache(dist, db, devid, eventid); - if (irq) - vgic_get_irq_kref(irq); + if (!vgic_try_get_irq_kref(irq)) + irq = NULL; raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); @@ -640,8 +652,13 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, * was in the cache, and increment it on the new interrupt. */ if (cte->irq) - __vgic_put_lpi_locked(kvm, cte->irq); + vgic_put_irq(kvm, cte->irq); + /* + * The irq refcount is guaranteed to be nonzero while holding the + * its_lock, as the ITE (and the reference it holds) cannot be freed. + */ + lockdep_assert_held(&its->its_lock); vgic_get_irq_kref(irq); cte->db = db; @@ -672,7 +689,7 @@ void vgic_its_invalidate_cache(struct kvm *kvm) if (!cte->irq) break; - __vgic_put_lpi_locked(kvm, cte->irq); + vgic_put_irq(kvm, cte->irq); cte->irq = NULL; } @@ -1345,8 +1362,8 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, } /** - * vgic_its_invall - invalidate all LPIs targetting a given vcpu - * @vcpu: the vcpu for which the RD is targetted by an invalidation + * vgic_its_invall - invalidate all LPIs targeting a given vcpu + * @vcpu: the vcpu for which the RD is targeted by an invalidation * * Contrary to the INVALL command, this targets a RD instead of a * collection, and we don't need to hold the its_lock, since no ITS is @@ -2144,7 +2161,7 @@ static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite) } /** - * entry_fn_t - Callback called on a table entry restore path + * typedef entry_fn_t - Callback called on a table entry restore path * @its: its handle * @id: id of the entry * @entry: pointer to the entry diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9465d3706ab9..4ea3340786b9 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -380,6 +380,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; bool vlpi_avail = false; + unsigned long index; int ret = 0; u8 val; @@ -396,7 +397,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) vlpi_avail = true; } - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + xa_for_each(&dist->lpi_xa, index, irq) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index db2a95762b1b..4ec93587c8cd 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -30,7 +30,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * its->its_lock (mutex) * vgic_cpu->ap_list_lock must be taken with IRQs disabled * kvm->lpi_list_lock must be taken with IRQs disabled - * vgic_irq->irq_lock must be taken with IRQs disabled + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled + * vgic_irq->irq_lock must be taken with IRQs disabled * * As the ap_list_lock might be taken from the timer interrupt handler, * we have to disable IRQs before taking this lock and everything lower @@ -54,32 +55,22 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { */ /* - * Iterate over the VM's list of mapped LPIs to find the one with a - * matching interrupt ID and return a reference to the IRQ structure. + * Index the VM's xarray of mapped LPIs and return a reference to the IRQ + * structure. The caller is expected to call vgic_put_irq() later once it's + * finished with the IRQ. */ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - if (irq->intid != intid) - continue; + rcu_read_lock(); - /* - * This increases the refcount, the caller is expected to - * call vgic_put_irq() later once it's finished with the IRQ. - */ - vgic_get_irq_kref(irq); - goto out_unlock; - } - irq = NULL; + irq = xa_load(&dist->lpi_xa, intid); + if (!vgic_try_get_irq_kref(irq)) + irq = NULL; -out_unlock: - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + rcu_read_unlock(); return irq; } @@ -120,22 +111,6 @@ static void vgic_irq_release(struct kref *ref) { } -/* - * Drop the refcount on the LPI. Must be called with lpi_list_lock held. - */ -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - if (!kref_put(&irq->refcount, vgic_irq_release)) - return; - - list_del(&irq->lpi_list); - dist->lpi_list_count--; - - kfree(irq); -} - void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -144,9 +119,15 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return; - raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); - __vgic_put_lpi_locked(kvm, irq); - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + if (!kref_put(&irq->refcount, vgic_irq_release)) + return; + + xa_lock_irqsave(&dist->lpi_xa, flags); + __xa_erase(&dist->lpi_xa, irq->intid); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + atomic_dec(&dist->lpi_count); + kfree_rcu(irq, rcu); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) @@ -203,7 +184,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) } /** - * kvm_vgic_target_oracle - compute the target vcpu for an irq + * vgic_target_oracle - compute the target vcpu for an irq * * @irq: The irq to route. Must be already locked. * @@ -404,7 +385,8 @@ retry: /* * Grab a reference to the irq to reflect the fact that it is - * now in the ap_list. + * now in the ap_list. This is safe as the caller must already hold a + * reference on the irq. */ vgic_get_irq_kref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 8d134569d0a1..0c2b82de8fa3 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -180,7 +180,6 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len); struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); -void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); @@ -220,12 +219,20 @@ void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); -static inline void vgic_get_irq_kref(struct vgic_irq *irq) +static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) { + if (!irq) + return false; + if (irq->intid < VGIC_MIN_LPI) - return; + return true; - kref_get(&irq->refcount); + return kref_get_unless_zero(&irq->refcount); +} + +static inline void vgic_get_irq_kref(struct vgic_irq *irq) +{ + WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b912b1409fc0..65090dd34641 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -35,6 +35,7 @@ HAS_GENERIC_AUTH_IMP_DEF HAS_GIC_CPUIF_SYSREGS HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC +HAS_HCR_NV1 HAS_HCX HAS_LDAPR HAS_LPA2 diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 4c9b67934367..53daaaef46cb 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1366,6 +1366,7 @@ EndEnum UnsignedEnum 43:40 SPECRES 0b0000 NI 0b0001 IMP + 0b0010 COSP_RCTX EndEnum UnsignedEnum 39:36 SB 0b0000 NI @@ -1492,7 +1493,12 @@ EndEnum EndSysreg Sysreg ID_AA64ISAR3_EL1 3 0 0 6 3 -Res0 63:12 +Res0 63:16 +UnsignedEnum 15:12 PACM + 0b0000 NI + 0b0001 TRIVIAL_IMP + 0b0010 FULL_IMP +EndEnum UnsignedEnum 11:8 TLBIW 0b0000 NI 0b0001 IMP @@ -1791,6 +1797,43 @@ UnsignedEnum 3:0 TCRX EndEnum EndSysreg +Sysreg ID_AA64MMFR4_EL1 3 0 0 7 4 +Res0 63:40 +UnsignedEnum 39:36 E3DSE + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 35:28 +SignedEnum 27:24 E2H0 + 0b0000 IMP + 0b1110 NI_NV1 + 0b1111 NI +EndEnum +UnsignedEnum 23:20 NV_frac + 0b0000 NV_NV2 + 0b0001 NV2_ONLY +EndEnum +UnsignedEnum 19:16 FGWTE3 + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 15:12 HACDBS + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 11:8 ASID2 + 0b0000 NI + 0b0001 IMP +EndEnum +SignedEnum 7:4 EIESB + 0b0000 NI + 0b0001 ToEL3 + 0b0010 ToELx + 0b1111 ANY +EndEnum +Res0 3:0 +EndSysreg + Sysreg SCTLR_EL1 3 0 1 0 0 Field 63 TIDCP Field 62 SPINTMASK diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 23407fbd73c9..d32abe7fe6ab 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2538,9 +2538,8 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ vcpu->kvm->arch.kvm_ops->create_vcpu_debugfs(vcpu, debugfs_dentry); } -int kvm_arch_create_vm_debugfs(struct kvm *kvm) +void kvm_arch_create_vm_debugfs(struct kvm *kvm) { if (kvm->arch.kvm_ops->create_vm_debugfs) kvm->arch.kvm_ops->create_vm_debugfs(kvm); - return 0; } diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95ea1a1f7403..999227fc7c66 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -189,9 +189,8 @@ static const struct file_operations mmu_rmaps_stat_fops = { .release = kvm_mmu_rmaps_stat_release, }; -int kvm_arch_create_vm_debugfs(struct kvm *kvm) +void kvm_arch_create_vm_debugfs(struct kvm *kvm) { debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, &mmu_rmaps_stat_fops); - return 0; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1cbc990d42e0..df6f99bdf70d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1862,8 +1862,25 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma /* * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. + * + * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, + * allowing KVM stage 2 device mapping attributes to use Normal-NC + * rather than DEVICE_nGnRE, which allows guest mappings + * supporting write-combining attributes (WC). ARM does not + * architecturally guarantee this is safe, and indeed some MMIO + * regions like the GICv2 VCPU interface can trigger uncontained + * faults if Normal-NC is used. + * + * To safely use VFIO in KVM the platform must guarantee full + * safety in the guest where no action taken against a MMIO + * mapping can trigger an uncontained failure. The assumption is + * that most VFIO PCI platforms support this for both mapping types, + * at least in common flows, based on some expectations of how + * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in + * the VMA flags. */ - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 4b9d8fb393a8..eb4c369a79eb 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -90,16 +90,6 @@ void kvm_vcpu_pmu_resync_el0(void); vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) -/* - * Evaluates as true when emulating PMUv3p5, and false otherwise. - */ -#define kvm_pmu_is_3p5(vcpu) ({ \ - u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); \ - u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \ - \ - pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; \ -}) - u8 kvm_arm_pmu_get_pmuver_limit(void); u64 kvm_pmu_evtyper_mask(struct kvm *kvm); int kvm_arm_set_default_pmu(struct kvm *kvm); @@ -168,7 +158,6 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) } #define kvm_vcpu_has_pmu(vcpu) ({ false; }) -#define kvm_pmu_is_3p5(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8cc38e836f54..47035946648e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/static_key.h> #include <linux/types.h> +#include <linux/xarray.h> #include <kvm/iodev.h> #include <linux/list.h> #include <linux/jump_label.h> @@ -116,7 +117,7 @@ struct irq_ops { struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ - struct list_head lpi_list; /* Used to link all LPIs together */ + struct rcu_head rcu; struct list_head ap_list; struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU @@ -273,10 +274,10 @@ struct vgic_dist { */ u64 propbaser; - /* Protects the lpi_list and the count value below. */ + /* Protects the lpi_list. */ raw_spinlock_t lpi_list_lock; - struct list_head lpi_list_head; - int lpi_list_count; + struct xarray lpi_xa; + atomic_t lpi_count; /* LPI translation cache */ struct list_head lpi_translation_cache; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..9a45f673f687 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1507,7 +1507,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); -int kvm_arch_create_vm_debugfs(struct kvm *kvm); +void kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..59576e56c58b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -391,6 +391,20 @@ extern unsigned int kobjsize(const void *objp); # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +/* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED_BIT 39 +#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 2cb8dd1f8275..8d6b923c6c23 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -158,9 +158,9 @@ static void guest_validate_irq(unsigned int intid, /* Basic 'timer condition met' check */ __GUEST_ASSERT(xcnt >= cval, - "xcnt = 0x%llx, cval = 0x%llx, xcnt_diff_us = 0x%llx", + "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx", xcnt, cval, xcnt_diff_us); - __GUEST_ASSERT(xctl & CTL_ISTATUS, "xcnt = 0x%llx", xcnt); + __GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl); WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); } diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c index 866002917441..2582c49e525a 100644 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -365,7 +365,7 @@ static void guest_wp_handler(struct ex_regs *regs) static void guest_ss_handler(struct ex_regs *regs) { - __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%u'", ss_idx); + __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx); ss_addr[ss_idx++] = regs->pc; regs->pstate |= SPSR_SS; } diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c index 27c10e7a7e01..9d192ce0078d 100644 --- a/tools/testing/selftests/kvm/aarch64/hypercalls.c +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -105,12 +105,12 @@ static void guest_test_hvc(const struct test_hvc_info *hc_info) case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: case TEST_STAGE_HVC_IFACE_FALSE_INFO: __GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED, - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u", + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", res.a0, hc_info->func_id, hc_info->arg1, stage); break; case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: __GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED, - "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u", + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", res.a0, hc_info->func_id, hc_info->arg1, stage); break; default: diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 53fddad57cbb..5972905275cf 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -292,7 +292,7 @@ static void guest_code(struct test_desc *test) static void no_dabt_handler(struct ex_regs *regs) { - GUEST_FAIL("Unexpected dabt, far_el1 = 0x%llx", read_sysreg(far_el1)); + GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1)); } static void no_iabt_handler(struct ex_regs *regs) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index bac05210b539..16e2338686c1 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -32,6 +32,10 @@ struct reg_ftr_bits { enum ftr_type type; uint8_t shift; uint64_t mask; + /* + * For FTR_EXACT, safe_val is used as the exact safe value. + * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. + */ int64_t safe_val; }; @@ -65,13 +69,13 @@ struct test_feature_reg { static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), REG_FTR_END, }; static const struct reg_ftr_bits ftr_id_dfr0_el1[] = { - S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, 0), - REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8), REG_FTR_END, }; @@ -224,13 +228,13 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - if (ftr_bits->type == FTR_UNSIGNED) { + if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { case FTR_EXACT: ftr = ftr_bits->safe_val; break; case FTR_LOWER_SAFE: - if (ftr > 0) + if (ftr > ftr_bits->safe_val) ftr--; break; case FTR_HIGHER_SAFE: @@ -252,7 +256,7 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) ftr = ftr_bits->safe_val; break; case FTR_LOWER_SAFE: - if (ftr > 0) + if (ftr > ftr_bits->safe_val) ftr--; break; case FTR_HIGHER_SAFE: @@ -276,7 +280,7 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) { uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - if (ftr_bits->type == FTR_UNSIGNED) { + if (ftr_bits->sign == FTR_UNSIGNED) { switch (ftr_bits->type) { case FTR_EXACT: ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index 5f9713364693..f2fb0e3f14bc 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -93,22 +93,6 @@ static inline void write_sel_evtyper(int sel, unsigned long val) isb(); } -static inline void enable_counter(int idx) -{ - uint64_t v = read_sysreg(pmcntenset_el0); - - write_sysreg(BIT(idx) | v, pmcntenset_el0); - isb(); -} - -static inline void disable_counter(int idx) -{ - uint64_t v = read_sysreg(pmcntenset_el0); - - write_sysreg(BIT(idx) | v, pmcntenclr_el0); - isb(); -} - static void pmu_disable_reset(void) { uint64_t pmcr = read_sysreg(pmcr_el0); @@ -195,11 +179,11 @@ struct pmc_accessor pmc_accessors[] = { \ if (set_expected) \ __GUEST_ASSERT((_tval & mask), \ - "tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ _tval, mask, set_expected); \ else \ __GUEST_ASSERT(!(_tval & mask), \ - "tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ _tval, mask, set_expected); \ } @@ -286,7 +270,7 @@ static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx) acc->write_typer(pmc_idx, write_data); read_data = acc->read_typer(pmc_idx); __GUEST_ASSERT(read_data == write_data, - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); /* @@ -297,14 +281,14 @@ static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx) /* The count value must be 0, as it is disabled and reset */ __GUEST_ASSERT(read_data == 0, - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx", + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx", pmc_idx, PMC_ACC_TO_IDX(acc), read_data); write_data = read_data + pmc_idx + 0x12345; acc->write_cntr(pmc_idx, write_data); read_data = acc->read_cntr(pmc_idx); __GUEST_ASSERT(read_data == write_data, - "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); } @@ -379,7 +363,7 @@ static void guest_code(uint64_t expected_pmcr_n) int i, pmc; __GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS, - "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%lx", + "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x", expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS); pmcr = read_sysreg(pmcr_el0); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 054d20ec61b9..801671150314 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1150,10 +1150,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) &stat_fops_per_vm); } - ret = kvm_arch_create_vm_debugfs(kvm); - if (ret) - goto out_err; - + kvm_arch_create_vm_debugfs(kvm); return 0; out_err: kvm_destroy_vm_debugfs(kvm); @@ -1183,9 +1180,8 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so * a per-arch destroy interface is not needed. */ -int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) +void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { - return 0; } static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) |