summaryrefslogtreecommitdiff
path: root/arch/arm64/kvm/hyp/nvhe
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm/hyp/nvhe')
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile17
-rw-r--r--arch/arm64/kvm/hyp/nvhe/clock.c68
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c140
-rw-r--r--arch/arm64/kvm/hyp/nvhe/events.c25
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c241
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S15
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S52
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c366
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S8
-rw-r--r--arch/arm64/kvm/hyp/nvhe/list_debug.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c1120
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c99
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c636
-rw-r--r--arch/arm64/kvm/hyp/nvhe/psci-relay.c45
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c47
-rw-r--r--arch/arm64/kvm/hyp/nvhe/stacktrace.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c104
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c67
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c10
-rw-r--r--arch/arm64/kvm/hyp/nvhe/trace.c311
21 files changed, 2799 insertions, 584 deletions
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index b43426a493df..62cdfbff7562 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,12 +12,12 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
- $(DISABLE_STACKLEAK_PLUGIN)
+ $(DISABLE_KSTACK_ERASE)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
-lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o tishift.o
lib-objs := $(addprefix ../../../lib/, $(lib-objs))
CFLAGS_switch.nvhe.o += -Wno-override-init
@@ -26,10 +26,15 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ../vgic-v5-sr.o
+hyp-obj-y += ../../../kernel/smccc-call.o
hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
+hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o events.o
hyp-obj-y += $(lib-objs)
+# Path to simple_ring_buffer.c
+CFLAGS_trace.nvhe.o += -I$(srctree)/kernel/trace/
+
##
## Build rules for compiling nVHE hyp code
## Output of this folder is `kvm_nvhe.o`, a partially linked object
@@ -99,3 +104,9 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAG
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
+
+ifeq ($(CONFIG_UBSAN_KVM_EL2),y)
+UBSAN_SANITIZE := y
+# Always use brk and not hooks
+ccflags-y += $(CFLAGS_UBSAN_TRAP)
+endif
diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c
new file mode 100644
index 000000000000..a7fc61976fd0
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/clock.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/clock.h>
+
+#include <asm/arch_timer.h>
+#include <asm/div64.h>
+
+static struct clock_data {
+ struct {
+ u32 mult;
+ u32 shift;
+ u64 epoch_ns;
+ u64 epoch_cyc;
+ u64 cyc_overflow64;
+ } data[2];
+ u64 cur;
+} trace_clock_data;
+
+static u64 __clock_mult_uint128(u64 cyc, u32 mult, u32 shift)
+{
+ __uint128_t ns = (__uint128_t)cyc * mult;
+
+ ns >>= shift;
+
+ return (u64)ns;
+}
+
+/* Does not guarantee no reader on the modified bank. */
+void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
+{
+ struct clock_data *clock = &trace_clock_data;
+ u64 bank = clock->cur ^ 1;
+
+ if (!mult || shift >= 64)
+ return;
+
+ clock->data[bank].mult = mult;
+ clock->data[bank].shift = shift;
+ clock->data[bank].epoch_ns = epoch_ns;
+ clock->data[bank].epoch_cyc = epoch_cyc;
+ clock->data[bank].cyc_overflow64 = ULONG_MAX / mult;
+
+ smp_store_release(&clock->cur, bank);
+}
+
+/* Use untrusted host data */
+u64 trace_clock(void)
+{
+ struct clock_data *clock = &trace_clock_data;
+ u64 bank = smp_load_acquire(&clock->cur);
+ u64 cyc, ns;
+
+ cyc = __arch_counter_get_cntvct() - clock->data[bank].epoch_cyc;
+
+ if (likely(cyc < clock->data[bank].cyc_overflow64)) {
+ ns = cyc * clock->data[bank].mult;
+ ns >>= clock->data[bank].shift;
+ } else {
+ ns = __clock_mult_uint128(cyc, clock->data[bank].mult,
+ clock->data[bank].shift);
+ }
+
+ return (u64)ns + clock->data[bank].epoch_ns;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 2f4a4f5036bb..f8904391c125 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -14,20 +14,20 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
-static void __debug_save_spe(u64 *pmscr_el1)
+static void __debug_save_spe(void)
{
- u64 reg;
+ u64 *pmscr_el1, *pmblimitr_el1;
- /* Clear pmscr in case of early return */
- *pmscr_el1 = 0;
+ pmscr_el1 = host_data_ptr(host_debug_state.pmscr_el1);
+ pmblimitr_el1 = host_data_ptr(host_debug_state.pmblimitr_el1);
/*
* At this point, we know that this CPU implements
* SPE and is available to the host.
* Check if the host is actually using it ?
*/
- reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
- if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT)))
+ *pmblimitr_el1 = read_sysreg_s(SYS_PMBLIMITR_EL1);
+ if (!(*pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT)))
return;
/* Yes; save the control register and disable data generation */
@@ -37,18 +37,29 @@ static void __debug_save_spe(u64 *pmscr_el1)
/* Now drain all buffered data to memory */
psb_csync();
+ dsb(nsh);
+
+ /* And disable the profiling buffer */
+ write_sysreg_s(0, SYS_PMBLIMITR_EL1);
+ isb();
}
-static void __debug_restore_spe(u64 pmscr_el1)
+static void __debug_restore_spe(void)
{
- if (!pmscr_el1)
+ u64 pmblimitr_el1 = *host_data_ptr(host_debug_state.pmblimitr_el1);
+
+ if (!(pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT)))
return;
/* The host page table is installed, but not yet synchronised */
isb();
+ /* Re-enable the profiling buffer. */
+ write_sysreg_s(pmblimitr_el1, SYS_PMBLIMITR_EL1);
+ isb();
+
/* Re-enable data generation */
- write_sysreg_el1(pmscr_el1, SYS_PMSCR);
+ write_sysreg_el1(*host_data_ptr(host_debug_state.pmscr_el1), SYS_PMSCR);
}
static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
@@ -57,12 +68,54 @@ static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr)
write_sysreg_el1(new_trfcr, SYS_TRFCR);
}
-static bool __trace_needs_drain(void)
+static void __trace_drain_and_disable(void)
{
- if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE))
- return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E;
+ u64 *trblimitr_el1 = host_data_ptr(host_debug_state.trblimitr_el1);
+ bool needs_drain = is_protected_kvm_enabled() ?
+ host_data_test_flag(HAS_TRBE) :
+ host_data_test_flag(TRBE_ENABLED);
- return host_data_test_flag(TRBE_ENABLED);
+ if (!needs_drain) {
+ *trblimitr_el1 = 0;
+ return;
+ }
+
+ *trblimitr_el1 = read_sysreg_s(SYS_TRBLIMITR_EL1);
+ if (*trblimitr_el1 & TRBLIMITR_EL1_E) {
+ /*
+ * The host has enabled the Trace Buffer Unit so we have
+ * to beat the CPU with a stick until it stops accessing
+ * memory.
+ */
+
+ /* First, ensure that our prior write to TRFCR has stuck. */
+ isb();
+
+ /* Now synchronise with the trace and drain the buffer. */
+ tsb_csync();
+ dsb(nsh);
+
+ /*
+ * With no more trace being generated, we can disable the
+ * Trace Buffer Unit.
+ */
+ write_sysreg_s(0, SYS_TRBLIMITR_EL1);
+ if (cpus_have_final_cap(ARM64_WORKAROUND_2064142)) {
+ /*
+ * Some CPUs are so good, we have to drain 'em
+ * twice.
+ */
+ tsb_csync();
+ dsb(nsh);
+ }
+
+ /*
+ * Ensure that the Trace Buffer Unit is disabled before
+ * we start mucking with the stage-2 and trap
+ * configuration.
+ */
+ isb();
+ }
}
static bool __trace_needs_switch(void)
@@ -79,24 +132,69 @@ static void __trace_switch_to_guest(void)
__trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1),
*host_data_ptr(trfcr_while_in_guest));
-
- if (__trace_needs_drain()) {
- isb();
- tsb_csync();
- }
+ __trace_drain_and_disable();
}
static void __trace_switch_to_host(void)
{
+ u64 trblimitr_el1 = *host_data_ptr(host_debug_state.trblimitr_el1);
+
+ if (trblimitr_el1 & TRBLIMITR_EL1_E) {
+ /* Re-enable the Trace Buffer Unit for the host. */
+ write_sysreg_s(trblimitr_el1, SYS_TRBLIMITR_EL1);
+ isb();
+ if (cpus_have_final_cap(ARM64_WORKAROUND_2038923)) {
+ /*
+ * Make sure the unit is re-enabled before we
+ * poke TRFCR.
+ */
+ isb();
+ }
+ }
+
__trace_do_switch(host_data_ptr(trfcr_while_in_guest),
*host_data_ptr(host_debug_state.trfcr_el1));
}
+static void __debug_save_brbe(void)
+{
+ u64 *brbcr_el1 = host_data_ptr(host_debug_state.brbcr_el1);
+
+ *brbcr_el1 = 0;
+
+ /* Check if the BRBE is enabled */
+ if (!(read_sysreg_el1(SYS_BRBCR) & (BRBCR_ELx_E0BRE | BRBCR_ELx_ExBRE)))
+ return;
+
+ /*
+ * Prohibit branch record generation while we are in guest.
+ * Since access to BRBCR_EL1 is trapped, the guest can't
+ * modify the filtering set by the host.
+ */
+ *brbcr_el1 = read_sysreg_el1(SYS_BRBCR);
+ write_sysreg_el1(0, SYS_BRBCR);
+}
+
+static void __debug_restore_brbe(void)
+{
+ u64 brbcr_el1 = *host_data_ptr(host_debug_state.brbcr_el1);
+
+ if (!brbcr_el1)
+ return;
+
+ /* Restore BRBE controls */
+ write_sysreg_el1(brbcr_el1, SYS_BRBCR);
+}
+
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
if (host_data_test_flag(HAS_SPE))
- __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1));
+ __debug_save_spe();
+
+ /* Disable BRBE branch records */
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_save_brbe();
if (__trace_needs_switch())
__trace_switch_to_guest();
@@ -110,7 +208,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
if (host_data_test_flag(HAS_SPE))
- __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1));
+ __debug_restore_spe();
+ if (host_data_test_flag(HAS_BRBE))
+ __debug_restore_brbe();
if (__trace_needs_switch())
__trace_switch_to_host();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c
new file mode 100644
index 000000000000..add9383aadb5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/events.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <nvhe/define_events.h>
+
+int __tracing_enable_event(unsigned short id, bool enable)
+{
+ struct hyp_event_id *event_id = &__hyp_event_ids_start[id];
+ atomic_t *enabled;
+
+ if (event_id >= __hyp_event_ids_end)
+ return -EINVAL;
+
+ enabled = hyp_fixmap_map(__hyp_pa(&event_id->enabled));
+ atomic_set(enabled, enable);
+ hyp_fixmap_unmap();
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index e433dfab882a..1af722771178 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -26,10 +26,10 @@
* the duration and are therefore serialised.
*/
-#include <linux/arm-smccc.h>
#include <linux/arm_ffa.h>
#include <asm/kvm_pkvm.h>
+#include <nvhe/arm-smccc.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/memory.h>
@@ -71,36 +71,68 @@ static u32 hyp_ffa_version;
static bool has_version_negotiated;
static hyp_spinlock_t version_lock;
-static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
+static void ffa_to_smccc_error(struct arm_smccc_1_2_regs *res, u64 ffa_errno)
{
- *res = (struct arm_smccc_res) {
+ *res = (struct arm_smccc_1_2_regs) {
.a0 = FFA_ERROR,
.a2 = ffa_errno,
};
}
-static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
+static void ffa_to_smccc_res_prop(struct arm_smccc_1_2_regs *res, int ret, u64 prop)
{
if (ret == FFA_RET_SUCCESS) {
- *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
- .a2 = prop };
+ *res = (struct arm_smccc_1_2_regs) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
} else {
ffa_to_smccc_error(res, ret);
}
}
-static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
+static void ffa_to_smccc_res(struct arm_smccc_1_2_regs *res, int ret)
{
ffa_to_smccc_res_prop(res, ret, 0);
}
static void ffa_set_retval(struct kvm_cpu_context *ctxt,
- struct arm_smccc_res *res)
+ struct arm_smccc_1_2_regs *res)
{
cpu_reg(ctxt, 0) = res->a0;
cpu_reg(ctxt, 1) = res->a1;
cpu_reg(ctxt, 2) = res->a2;
cpu_reg(ctxt, 3) = res->a3;
+ cpu_reg(ctxt, 4) = res->a4;
+ cpu_reg(ctxt, 5) = res->a5;
+ cpu_reg(ctxt, 6) = res->a6;
+ cpu_reg(ctxt, 7) = res->a7;
+
+ /*
+ * DEN0028C 2.6: SMC32/HVC32 call from aarch64 must preserve x8-x30.
+ *
+ * In FF-A 1.2, we cannot rely on the function ID sent by the caller to
+ * detect 32-bit calls because the CPU cycle management interfaces (e.g.
+ * FFA_MSG_WAIT, FFA_RUN) are 32-bit only but can have 64-bit responses.
+ *
+ * FFA-1.3 introduces 64-bit variants of the CPU cycle management
+ * interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests
+ * complete with SMC32 direct responses which *should* allow us use the
+ * function ID sent by the caller to determine whether to return x8-x17.
+ *
+ * Note that we also cannot rely on function IDs in the response.
+ *
+ * Given the above, assume SMC64 and send back x0-x17 unconditionally
+ * as the passthrough code (__kvm_hyp_host_forward_smc) does the same.
+ */
+ cpu_reg(ctxt, 8) = res->a8;
+ cpu_reg(ctxt, 9) = res->a9;
+ cpu_reg(ctxt, 10) = res->a10;
+ cpu_reg(ctxt, 11) = res->a11;
+ cpu_reg(ctxt, 12) = res->a12;
+ cpu_reg(ctxt, 13) = res->a13;
+ cpu_reg(ctxt, 14) = res->a14;
+ cpu_reg(ctxt, 15) = res->a15;
+ cpu_reg(ctxt, 16) = res->a16;
+ cpu_reg(ctxt, 17) = res->a17;
}
static bool is_ffa_call(u64 func_id)
@@ -113,82 +145,92 @@ static bool is_ffa_call(u64 func_id)
static int ffa_map_hyp_buffers(u64 ffa_page_count)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
- hyp_virt_to_phys(hyp_buffers.tx),
- hyp_virt_to_phys(hyp_buffers.rx),
- ffa_page_count,
- 0, 0, 0, 0,
- &res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_RXTX_MAP,
+ .a1 = hyp_virt_to_phys(hyp_buffers.tx),
+ .a2 = hyp_virt_to_phys(hyp_buffers.rx),
+ .a3 = ffa_page_count,
+ }, &res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
static int ffa_unmap_hyp_buffers(void)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
- HOST_FFA_ID,
- 0, 0, 0, 0, 0, 0,
- &res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RXTX_UNMAP,
+ .a1 = HOST_FFA_ID,
+ }, &res);
return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
}
-static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fraglen, u32 endpoint_id)
{
- arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
- handle_lo, handle_hi, fraglen, endpoint_id,
- 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_TX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fraglen,
+ .a4 = endpoint_id,
+ }, res);
}
-static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 fragoff)
{
- arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
- handle_lo, handle_hi, fragoff, HOST_FFA_ID,
- 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_FRAG_RX,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = fragoff,
+ .a4 = HOST_FFA_ID,
+ }, res);
}
-static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
+static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
u32 fraglen)
{
- arm_smccc_1_1_smc(func_id, len, fraglen,
- 0, 0, 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = func_id,
+ .a1 = len,
+ .a2 = fraglen,
+ }, res);
}
-static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
+static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
u32 handle_hi, u32 flags)
{
- arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
- handle_lo, handle_hi, flags,
- 0, 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_MEM_RECLAIM,
+ .a1 = handle_lo,
+ .a2 = handle_hi,
+ .a3 = flags,
+ }, res);
}
-static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
+static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
{
- arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
- len, len,
- 0, 0, 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_FN64_MEM_RETRIEVE_REQ,
+ .a1 = len,
+ .a2 = len,
+ }, res);
}
-static void ffa_rx_release(struct arm_smccc_res *res)
+static void ffa_rx_release(struct arm_smccc_1_2_regs *res)
{
- arm_smccc_1_1_smc(FFA_RX_RELEASE,
- 0, 0,
- 0, 0, 0, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_RX_RELEASE,
+ }, res);
}
-static void do_ffa_rxtx_map(struct arm_smccc_res *res,
+static void do_ffa_rxtx_map(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(phys_addr_t, tx, ctxt, 1);
@@ -267,7 +309,7 @@ err_unmap:
goto out_unlock;
}
-static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
+static void do_ffa_rxtx_unmap(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
@@ -368,7 +410,7 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
return ret;
}
-static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
+static void do_ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
@@ -427,7 +469,7 @@ out:
}
static void __do_ffa_mem_xfer(const u64 func_id,
- struct arm_smccc_res *res,
+ struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, len, ctxt, 1);
@@ -437,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id,
struct ffa_mem_region_attributes *ep_mem_access;
struct ffa_composite_mem_region *reg;
struct ffa_mem_region *buf;
- u32 offset, nr_ranges;
+ u32 offset, nr_ranges, checked_offset;
int ret = 0;
if (addr_mbz || npages_mbz || fraglen > len ||
@@ -474,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
- if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < checked_offset) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
@@ -521,7 +568,7 @@ err_unshare:
__do_ffa_mem_xfer((fid), (res), (ctxt)); \
} while (0);
-static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
+static void do_ffa_mem_reclaim(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, handle_lo, ctxt, 1);
@@ -628,13 +675,26 @@ static bool ffa_call_supported(u64 func_id)
case FFA_RXTX_MAP:
case FFA_MEM_DONATE:
case FFA_MEM_RETRIEVE_REQ:
+ /* Optional notification interfaces added in FF-A 1.1 */
+ case FFA_NOTIFICATION_BITMAP_CREATE:
+ case FFA_NOTIFICATION_BITMAP_DESTROY:
+ case FFA_NOTIFICATION_BIND:
+ case FFA_NOTIFICATION_UNBIND:
+ case FFA_NOTIFICATION_SET:
+ case FFA_NOTIFICATION_GET:
+ case FFA_NOTIFICATION_INFO_GET:
+ /* Optional interfaces added in FF-A 1.2 */
+ case FFA_MSG_SEND_DIRECT_REQ2: /* Optional per 7.5.1 */
+ case FFA_MSG_SEND_DIRECT_RESP2: /* Optional per 7.5.1 */
+ case FFA_CONSOLE_LOG: /* Optional per 13.1: not in Table 13.1 */
+ case FFA_PARTITION_INFO_GET_REGS: /* Optional for virtual instances per 13.1 */
return false;
}
return true;
}
-static bool do_ffa_features(struct arm_smccc_res *res,
+static bool do_ffa_features(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, id, ctxt, 1);
@@ -666,21 +726,25 @@ out_handled:
static int hyp_ffa_post_init(void)
{
size_t min_rxtx_sz;
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
- arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_ID_GET,
+ }, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
if (res.a2 != HOST_FFA_ID)
return -EINVAL;
- arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
- 0, 0, 0, 0, 0, 0, &res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+ .a0 = FFA_FEATURES,
+ .a1 = FFA_FN64_RXTX_MAP,
+ }, &res);
if (res.a0 != FFA_SUCCESS)
return -EOPNOTSUPP;
- switch (res.a2) {
+ switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) {
case FFA_FEAT_RXTX_MIN_SZ_4K:
min_rxtx_sz = SZ_4K;
break;
@@ -700,7 +764,7 @@ static int hyp_ffa_post_init(void)
return 0;
}
-static void do_ffa_version(struct arm_smccc_res *res,
+static void do_ffa_version(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, ffa_req_version, ctxt, 1);
@@ -712,7 +776,10 @@ static void do_ffa_version(struct arm_smccc_res *res,
hyp_spin_lock(&version_lock);
if (has_version_negotiated) {
- res->a0 = hyp_ffa_version;
+ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version))
+ res->a0 = FFA_RET_NOT_SUPPORTED;
+ else
+ res->a0 = hyp_ffa_version;
goto unlock;
}
@@ -721,26 +788,27 @@ static void do_ffa_version(struct arm_smccc_res *res,
* first if TEE supports it.
*/
if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
- arm_smccc_1_1_smc(FFA_VERSION, ffa_req_version, 0,
- 0, 0, 0, 0, 0,
- res);
- if (res->a0 == FFA_RET_NOT_SUPPORTED)
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = ffa_req_version,
+ }, res);
+ if ((s32)res->a0 == FFA_RET_NOT_SUPPORTED)
goto unlock;
hyp_ffa_version = ffa_req_version;
}
- if (hyp_ffa_post_init())
+ if (hyp_ffa_post_init()) {
res->a0 = FFA_RET_NOT_SUPPORTED;
- else {
- has_version_negotiated = true;
+ } else {
+ smp_store_release(&has_version_negotiated, true);
res->a0 = hyp_ffa_version;
}
unlock:
hyp_spin_unlock(&version_lock);
}
-static void do_ffa_part_get(struct arm_smccc_res *res,
+static void do_ffa_part_get(struct arm_smccc_1_2_regs *res,
struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, uuid0, ctxt, 1);
@@ -756,9 +824,14 @@ static void do_ffa_part_get(struct arm_smccc_res *res,
goto out_unlock;
}
- arm_smccc_1_1_smc(FFA_PARTITION_INFO_GET, uuid0, uuid1,
- uuid2, uuid3, flags, 0, 0,
- res);
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_PARTITION_INFO_GET,
+ .a1 = uuid0,
+ .a2 = uuid1,
+ .a3 = uuid2,
+ .a4 = uuid3,
+ .a5 = flags,
+ }, res);
if (res->a0 != FFA_SUCCESS)
goto out_unlock;
@@ -791,7 +864,7 @@ out_unlock:
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
/*
* There's no way we can tell what a non-standard SMC call might
@@ -809,7 +882,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
if (!is_ffa_call(func_id))
return false;
- if (!has_version_negotiated && func_id != FFA_VERSION) {
+ if (func_id != FFA_VERSION &&
+ !smp_load_acquire(&has_version_negotiated)) {
ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS);
goto out_handled;
}
@@ -859,14 +933,17 @@ out_handled:
int hyp_ffa_init(void *pages)
{
- struct arm_smccc_res res;
+ struct arm_smccc_1_2_regs res;
void *tx, *rx;
if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
return 0;
- arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_1, 0, 0, 0, 0, 0, 0, &res);
- if (res.a0 == FFA_RET_NOT_SUPPORTED)
+ hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+ .a0 = FFA_VERSION,
+ .a1 = FFA_VERSION_1_2,
+ }, &res);
+ if ((s32)res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
/*
@@ -885,10 +962,10 @@ int hyp_ffa_init(void *pages)
if (FFA_MAJOR_VERSION(res.a0) != 1)
return -EOPNOTSUPP;
- if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_1))
+ if (FFA_MINOR_VERSION(res.a0) < FFA_MINOR_VERSION(FFA_VERSION_1_2))
hyp_ffa_version = res.a0;
else
- hyp_ffa_version = FFA_VERSION_1_1;
+ hyp_ffa_version = FFA_VERSION_1_2;
tx = pages;
pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 58f0cb2298cc..9393fe3ea6a1 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -120,12 +120,11 @@ SYM_FUNC_START(__hyp_do_panic)
mov x29, x0
-#ifdef CONFIG_NVHE_EL2_DEBUG
+#ifdef CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC
/* Ensure host stage-2 is disabled */
mrs x0, hcr_el2
bic x0, x0, #HCR_VM
- msr hcr_el2, x0
- isb
+ msr_hcr_el2 x0
tlbi vmalls12e1
dsb nsh
#endif
@@ -291,13 +290,3 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc)
ret
SYM_CODE_END(__kvm_hyp_host_forward_smc)
-
-/*
- * kvm_host_psci_cpu_entry is called through br instruction, which requires
- * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external
- * functions, but bti c instead.
- */
-SYM_CODE_START(kvm_host_psci_cpu_entry)
- bti j
- b __kvm_host_psci_cpu_entry
-SYM_CODE_END(kvm_host_psci_cpu_entry)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index f8af11189572..89cb553be1e5 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -100,7 +100,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
msr mair_el2, x1
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
- msr hcr_el2, x1
+ msr_hcr_el2 x1
mov x2, #HCR_E2H
and x2, x1, x2
@@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_PGD_PA]
phys_to_ttbr x2, x1
alternative_if ARM64_HAS_CNP
- orr x2, x2, #TTBR_CNP_BIT
+ orr x2, x2, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x2
@@ -173,9 +173,8 @@ SYM_CODE_END(___kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START(kvm_hyp_cpu_entry)
- mov x1, #1 // is_cpu_on = true
+ ldr x29, =__kvm_host_psci_cpu_on_entry
b __kvm_hyp_init_cpu
-SYM_CODE_END(kvm_hyp_cpu_entry)
/*
* PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point
@@ -183,32 +182,17 @@ SYM_CODE_END(kvm_hyp_cpu_entry)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START(kvm_hyp_cpu_resume)
- mov x1, #0 // is_cpu_on = false
- b __kvm_hyp_init_cpu
-SYM_CODE_END(kvm_hyp_cpu_resume)
+ ldr x29, =__kvm_host_psci_cpu_resume_entry
-/*
- * Common code for CPU entry points. Initializes EL2 state and
- * installs the hypervisor before handing over to a C handler.
- *
- * x0: struct kvm_nvhe_init_params PA
- * x1: bool is_cpu_on
- */
-SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
+SYM_INNER_LABEL(__kvm_hyp_init_cpu, SYM_L_LOCAL)
mov x28, x0 // Stash arguments
- mov x29, x1
/* Check that the core was booted in EL2. */
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
- b.eq 2f
-
- /* The core booted in EL1. KVM cannot be initialized on it. */
-1: wfe
- wfi
- b 1b
+ b.ne 1f
-2: msr SPsel, #1 // We want to use SP_EL{1,2}
+ msr SPsel, #1 // We want to use SP_EL2
init_el2_hcr 0
@@ -218,11 +202,16 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
mov x0, x28
bl ___kvm_hyp_init // Clobbers x0..x2
- /* Leave idmap. */
- mov x0, x29
- ldr x1, =kvm_host_psci_cpu_entry
- br x1
-SYM_CODE_END(__kvm_hyp_init_cpu)
+ /* Leave idmap -- using BLR is OK, LR is restored from host context */
+ blr x29
+
+ // The core booted in EL1, or the C code unexpectedly returned.
+ // Either way, KVM cannot be initialized on it.
+1: wfe
+ wfi
+ b 1b
+SYM_CODE_END(kvm_hyp_cpu_resume)
+SYM_CODE_END(kvm_hyp_cpu_entry)
SYM_CODE_START(__kvm_handle_stub_hvc)
/*
@@ -260,11 +249,6 @@ reset:
msr sctlr_el2, x5
isb
-alternative_if ARM64_KVM_PROTECTED_MODE
- mov_q x5, HCR_HOST_NVHE_FLAGS
- msr hcr_el2, x5
-alternative_else_nop_endif
-
/* Install stub vectors */
adr_l x5, __hyp_stub_vectors
msr vbar_el2, x5
@@ -296,7 +280,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Install the new pgtables */
phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
- orr x5, x5, #TTBR_CNP_BIT
+ orr x5, x5, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x5
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 2c37680d954c..06db299c37a8 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -12,12 +12,14 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
+#include <nvhe/trace.h>
#include <nvhe/trap_handler.h>
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -26,7 +28,7 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu)
{
- __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+ __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR));
/*
* On saving/restoring guest sve state, always use the maximum VL for
* the guest. The layout of the data when saving the sve state depends
@@ -69,7 +71,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
if (!guest_owns_fp_regs())
return;
- cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+ /*
+ * Traps have been disabled by __deactivate_cptr_traps(), but there
+ * hasn't necessarily been a context synchronization event yet.
+ */
isb();
if (vcpu_has_sve(vcpu))
@@ -79,7 +84,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm));
if (has_fpmr)
- __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR);
+ __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR));
if (system_supports_sve())
__hyp_sve_restore_host();
@@ -123,10 +128,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
- hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
- /* Limit guest vector length to the maximum supported by the host. */
- hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl);
-
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE);
hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) &
@@ -137,6 +138,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2;
hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
+
+ hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid;
}
static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
@@ -158,6 +161,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr;
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
}
@@ -169,9 +173,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
DECLARE_REG(u64, hcr_el2, host_ctxt, 3);
struct pkvm_hyp_vcpu *hyp_vcpu;
- if (!is_protected_kvm_enabled())
- return;
-
hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx);
if (!hyp_vcpu)
return;
@@ -180,17 +181,16 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt)
/* Propagate WFx trapping flags */
hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI);
hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI);
+ } else {
+ memcpy(&hyp_vcpu->vcpu.arch.fgt, hyp_vcpu->host_vcpu->arch.fgt,
+ sizeof(hyp_vcpu->vcpu.arch.fgt));
}
}
static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt)
{
- struct pkvm_hyp_vcpu *hyp_vcpu;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
- if (!is_protected_kvm_enabled())
- return;
-
- hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (hyp_vcpu)
pkvm_put_hyp_vcpu(hyp_vcpu);
}
@@ -245,17 +245,35 @@ static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
&host_vcpu->arch.pkvm_memcache);
}
-static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_host_donate_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, pfn, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
- DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
+ hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+ if (!hyp_vcpu || !pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ goto out;
+
+ ret = pkvm_refill_memcache(hyp_vcpu);
+ if (ret)
goto out;
+ ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu);
+out:
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, pfn, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ int ret = -EINVAL;
+
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -264,7 +282,7 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
if (ret)
goto out;
- ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot);
+ ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot);
out:
cpu_reg(host_ctxt, 1) = ret;
}
@@ -273,17 +291,15 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
- ret = __pkvm_host_unshare_guest(gfn, hyp_vm);
+ ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;
@@ -296,9 +312,6 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -312,17 +325,15 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
- ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm);
+ ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;
@@ -332,18 +343,16 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
- DECLARE_REG(bool, mkold, host_ctxt, 3);
+ DECLARE_REG(u64, nr_pages, host_ctxt, 3);
+ DECLARE_REG(bool, mkold, host_ctxt, 4);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
goto out;
- ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm);
+ ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;
@@ -355,9 +364,6 @@ static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt)
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
- if (!is_protected_kvm_enabled())
- goto out;
-
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu))
goto out;
@@ -417,12 +423,8 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
- struct pkvm_hyp_vm *hyp_vm;
-
- if (!is_protected_kvm_enabled())
- return;
+ struct pkvm_hyp_vm *hyp_vm = get_np_pkvm_hyp_vm(handle);
- hyp_vm = get_np_pkvm_hyp_vm(handle);
if (!hyp_vm)
return;
@@ -461,11 +463,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
__vgic_v3_init_lrs();
}
-static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
- __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
}
static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
@@ -479,17 +481,15 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
DECLARE_REG(unsigned long, size, host_ctxt, 2);
- DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
- DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
- DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+ DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 3);
+ DECLARE_REG(u32, hyp_va_bits, host_ctxt, 4);
/*
* __pkvm_init() will return only if an error occurred, otherwise it
* will tail-call in __pkvm_init_finalise() which will have to deal
* with the host context directly.
*/
- cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
- hyp_va_bits);
+ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, per_cpu_base, hyp_va_bits);
}
static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
@@ -543,6 +543,18 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
}
+static void handle___pkvm_reserve_vm(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __pkvm_reserve_vm();
+}
+
+static void handle___pkvm_unreserve_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ __pkvm_unreserve_vm(handle);
+}
+
static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
@@ -563,11 +575,115 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
}
-static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_vcpu_in_poison_fault(struct kvm_cpu_context *host_ctxt)
+{
+ int ret;
+ struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+
+ ret = hyp_vcpu ? __pkvm_vcpu_in_poison_fault(hyp_vcpu) : -EINVAL;
+ cpu_reg(host_ctxt, 1) = ret;
+}
+
+static void handle___pkvm_force_reclaim_guest_page(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_host_force_reclaim_page_guest(phys);
+}
+
+static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+ DECLARE_REG(u64, gfn, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, gfn);
+}
+
+static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
+}
+
+static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
+}
+
+static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
+ DECLARE_REG(size_t, desc_size, host_ctxt, 2);
- cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+ cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
+}
+
+static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
+{
+ __tracing_unload();
+}
+
+static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(bool, enable, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
+}
+
+static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
+}
+
+static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u32, mult, host_ctxt, 1);
+ DECLARE_REG(u32, shift, host_ctxt, 2);
+ DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
+ DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
+
+ __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
+}
+
+static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
+}
+
+static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned short, id, host_ctxt, 1);
+ DECLARE_REG(bool, enable, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
+}
+
+static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, id, host_ctxt, 1);
+
+ trace_selftest(id);
+}
+
+static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_save_apr(kern_hyp_va(cpu_if));
+}
+
+static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1);
+
+ __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if));
}
typedef void (*hcall_t)(struct kvm_cpu_context *);
@@ -584,14 +700,6 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_get_gic_config),
HANDLE_FUNC(__pkvm_prot_finalize),
- HANDLE_FUNC(__pkvm_host_share_hyp),
- HANDLE_FUNC(__pkvm_host_unshare_hyp),
- HANDLE_FUNC(__pkvm_host_share_guest),
- HANDLE_FUNC(__pkvm_host_unshare_guest),
- HANDLE_FUNC(__pkvm_host_relax_perms_guest),
- HANDLE_FUNC(__pkvm_host_wrprotect_guest),
- HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
- HANDLE_FUNC(__pkvm_host_mkyoung_guest),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
@@ -601,11 +709,37 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
- HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
+ HANDLE_FUNC(__tracing_load),
+ HANDLE_FUNC(__tracing_unload),
+ HANDLE_FUNC(__tracing_enable),
+ HANDLE_FUNC(__tracing_swap_reader),
+ HANDLE_FUNC(__tracing_update_clock),
+ HANDLE_FUNC(__tracing_reset),
+ HANDLE_FUNC(__tracing_enable_event),
+ HANDLE_FUNC(__tracing_write_event),
+ HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v5_save_apr),
+ HANDLE_FUNC(__vgic_v5_restore_vmcr_apr),
+
+ HANDLE_FUNC(__pkvm_host_share_hyp),
+ HANDLE_FUNC(__pkvm_host_unshare_hyp),
+ HANDLE_FUNC(__pkvm_host_donate_guest),
+ HANDLE_FUNC(__pkvm_host_share_guest),
+ HANDLE_FUNC(__pkvm_host_unshare_guest),
+ HANDLE_FUNC(__pkvm_host_relax_perms_guest),
+ HANDLE_FUNC(__pkvm_host_wrprotect_guest),
+ HANDLE_FUNC(__pkvm_host_test_clear_young_guest),
+ HANDLE_FUNC(__pkvm_host_mkyoung_guest),
+ HANDLE_FUNC(__pkvm_reserve_vm),
+ HANDLE_FUNC(__pkvm_unreserve_vm),
HANDLE_FUNC(__pkvm_init_vm),
HANDLE_FUNC(__pkvm_init_vcpu),
- HANDLE_FUNC(__pkvm_teardown_vm),
+ HANDLE_FUNC(__pkvm_vcpu_in_poison_fault),
+ HANDLE_FUNC(__pkvm_force_reclaim_guest_page),
+ HANDLE_FUNC(__pkvm_reclaim_dying_guest_page),
+ HANDLE_FUNC(__pkvm_start_teardown_vm),
+ HANDLE_FUNC(__pkvm_finalize_teardown_vm),
HANDLE_FUNC(__pkvm_vcpu_load),
HANDLE_FUNC(__pkvm_vcpu_put),
HANDLE_FUNC(__pkvm_tlb_flush_vmid),
@@ -614,9 +748,11 @@ static const hcall_t host_hcall[] = {
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(unsigned long, id, host_ctxt, 0);
- unsigned long hcall_min = 0;
+ unsigned long hcall_min = 0, hcall_max = __KVM_HOST_SMCCC_FUNC_MAX;
hcall_t hfn;
+ BUILD_BUG_ON(ARRAY_SIZE(host_hcall) != __KVM_HOST_SMCCC_FUNC_MAX);
+
/*
* If pKVM has been initialised then reject any calls to the
* early "privileged" hypercalls. Note that we cannot reject
@@ -626,13 +762,16 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
* basis. This is all fine, however, since __pkvm_prot_finalize
* returns -EPERM after the first call for a given CPU.
*/
- if (static_branch_unlikely(&kvm_protected_mode_initialized))
- hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
+ if (static_branch_unlikely(&kvm_protected_mode_initialized)) {
+ hcall_min = __KVM_HOST_SMCCC_FUNC_MIN_PKVM;
+ } else {
+ hcall_max = __KVM_HOST_SMCCC_FUNC_PKVM_ONLY;
+ }
id &= ~ARM_SMCCC_CALL_HINTS;
id -= KVM_HOST_SMCCC_ID(0);
- if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
+ if (unlikely(id < hcall_min || id >= hcall_max))
goto inval;
hfn = host_hcall[id];
@@ -649,15 +788,27 @@ inval:
static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
{
+ trace_hyp_exit(host_ctxt, HYP_REASON_SMC);
__kvm_hyp_host_forward_smc(host_ctxt);
+ trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
}
static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, func_id, host_ctxt, 0);
+ u64 esr = read_sysreg_el2(SYS_ESR);
bool handled;
+ if (esr & ESR_ELx_xVC_IMM_MASK) {
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+ goto exit_skip_instr;
+ }
+
func_id &= ~ARM_SMCCC_CALL_HINTS;
+ if (upper_32_bits(func_id)) {
+ cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+ goto exit_skip_instr;
+ }
handled = kvm_host_psci_handler(host_ctxt, func_id);
if (!handled)
@@ -665,26 +816,109 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
if (!handled)
default_host_smc_handler(host_ctxt);
+exit_skip_instr:
/* SMC was trapped, move ELR past the current PC. */
kvm_skip_host_instr();
}
+void inject_host_exception(u64 esr)
+{
+ u64 sctlr, spsr_el1, spsr_el2, exc_offset = except_type_sync;
+ const u64 spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT |
+ PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT;
+
+ spsr_el1 = spsr_el2 = read_sysreg_el2(SYS_SPSR);
+ switch (spsr_el1 & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
+ case PSR_MODE_EL0t:
+ exc_offset += LOWER_EL_AArch64_VECTOR;
+ break;
+ case PSR_MODE_EL0t | PSR_MODE32_BIT:
+ exc_offset += LOWER_EL_AArch32_VECTOR;
+ break;
+ default:
+ exc_offset += CURRENT_EL_SP_ELx_VECTOR;
+ }
+
+ spsr_el2 &= spsr_mask;
+ spsr_el2 |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
+ PSR_MODE_EL1h;
+
+ sctlr = read_sysreg_el1(SYS_SCTLR);
+ if (!(sctlr & SCTLR_EL1_SPAN))
+ spsr_el2 |= PSR_PAN_BIT;
+
+ if (sctlr & SCTLR_ELx_DSSBS)
+ spsr_el2 |= PSR_SSBS_BIT;
+
+ if (system_supports_mte())
+ spsr_el2 |= PSR_TCO_BIT;
+
+ if (esr_fsc_is_translation_fault(esr))
+ write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR);
+
+ write_sysreg_el1(esr, SYS_ESR);
+ write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR);
+ write_sysreg_el1(spsr_el1, SYS_SPSR);
+ write_sysreg_el2(read_sysreg_el1(SYS_VBAR) + exc_offset, SYS_ELR);
+ write_sysreg_el2(spsr_el2, SYS_SPSR);
+}
+
+static void inject_host_undef64(void)
+{
+ inject_host_exception((ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_IL);
+}
+
+static bool handle_host_mte(u64 esr)
+{
+ switch (esr_sys64_to_sysreg(esr)) {
+ case SYS_RGSR_EL1:
+ case SYS_GCR_EL1:
+ case SYS_TFSR_EL1:
+ case SYS_TFSRE0_EL1:
+ /* If we're here for any reason other than MTE, it's a bug. */
+ if (read_sysreg(HCR_EL2) & HCR_ATA)
+ return false;
+ break;
+ case SYS_GMID_EL1:
+ /* If we're here for any reason other than MTE, it's a bug. */
+ if (!(read_sysreg(HCR_EL2) & HCR_TID5))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ inject_host_undef64();
+ return true;
+}
+
void handle_trap(struct kvm_cpu_context *host_ctxt)
{
u64 esr = read_sysreg_el2(SYS_ESR);
+
switch (ESR_ELx_EC(esr)) {
case ESR_ELx_EC_HVC64:
+ trace_hyp_enter(host_ctxt, HYP_REASON_HVC);
handle_host_hcall(host_ctxt);
break;
case ESR_ELx_EC_SMC64:
+ trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_IABT_LOW:
case ESR_ELx_EC_DABT_LOW:
+ trace_hyp_enter(host_ctxt, HYP_REASON_HOST_ABORT);
handle_host_mem_abort(host_ctxt);
break;
+ case ESR_ELx_EC_SYS64:
+ if (handle_host_mte(esr))
+ break;
+ fallthrough;
default:
BUG();
}
+
+ trace_hyp_exit(host_ctxt, HYP_REASON_ERET_HOST);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index f4562f417d3f..7a02837203d1 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -16,6 +16,12 @@ SECTIONS {
HYP_SECTION(.text)
HYP_SECTION(.data..ro_after_init)
HYP_SECTION(.rodata)
+#ifdef CONFIG_NVHE_EL2_TRACING
+ . = ALIGN(PAGE_SIZE);
+ BEGIN_HYP_SECTION(.event_ids)
+ *(SORT(.hyp.event_ids.*))
+ END_HYP_SECTION
+#endif
/*
* .hyp..data..percpu needs to be page aligned to maintain the same
@@ -25,5 +31,7 @@ SECTIONS {
BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
END_HYP_SECTION
+
HYP_SECTION(.bss)
+ HYP_SECTION(.data)
}
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
index 46a2d4f2b3c6..baa6260f88dc 100644
--- a/arch/arm64/kvm/hyp/nvhe/list_debug.c
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
bool corruption = unlikely(condition); \
if (corruption) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
- BUG_ON(1); \
+ BUG(); \
} else \
WARN_ON(1); \
} \
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 19c3c631708c..25f04629014e 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -5,6 +5,7 @@
*/
#include <linux/kvm_host.h>
+
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
@@ -14,12 +15,14 @@
#include <hyp/fault.h>
+#include <nvhe/arm-smccc.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
-#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP)
struct host_mmu host_mmu;
@@ -28,6 +31,19 @@ static struct hyp_pool host_s2_pool;
static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm);
#define current_vm (*this_cpu_ptr(&__current_vm))
+static void pkvm_sme_dvmsync_fw_call(void)
+{
+ if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) {
+ struct arm_smccc_res res;
+
+ /*
+ * Ignore the return value. Probing for the workaround
+ * availability took place in init_hyp_mode().
+ */
+ hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res);
+ }
+}
+
static void guest_lock_component(struct pkvm_hyp_vm *vm)
{
hyp_spin_lock(&vm->lock);
@@ -60,6 +76,11 @@ static void hyp_unlock_component(void)
hyp_spin_unlock(&pkvm_pgd_lock);
}
+#define for_each_hyp_page(__p, __st, __sz) \
+ for (struct hyp_page *__p = hyp_phys_to_page(__st), \
+ *__e = __p + ((__sz) >> PAGE_SHIFT); \
+ __p < __e; __p++)
+
static void *host_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));
@@ -161,12 +182,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
return 0;
}
-static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
- enum kvm_pgtable_prot prot)
-{
- return true;
-}
-
static void *guest_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size));
@@ -217,16 +232,42 @@ static void guest_s2_put_page(void *addr)
hyp_put_page(&current_vm->pool, addr);
}
+static void __apply_guest_page(void *va, size_t size,
+ void (*func)(void *addr, size_t size))
+{
+ size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE);
+ va = PTR_ALIGN_DOWN(va, PAGE_SIZE);
+ size = PAGE_ALIGN(size);
+
+ while (size) {
+ size_t map_size = PAGE_SIZE;
+ void *map;
+
+ if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE)
+ map = hyp_fixblock_map(__hyp_pa(va), &map_size);
+ else
+ map = hyp_fixmap_map(__hyp_pa(va));
+
+ func(map, map_size);
+
+ if (map_size == PMD_SIZE)
+ hyp_fixblock_unmap();
+ else
+ hyp_fixmap_unmap();
+
+ size -= map_size;
+ va += map_size;
+ }
+}
+
static void clean_dcache_guest_page(void *va, size_t size)
{
- __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
- hyp_fixmap_unmap();
+ __apply_guest_page(va, size, __clean_dcache_guest_page);
}
static void invalidate_icache_guest_page(void *va, size_t size)
{
- __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
- hyp_fixmap_unmap();
+ __apply_guest_page(va, size, __invalidate_icache_guest_page);
}
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
@@ -255,8 +296,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
};
guest_lock_component(vm);
- ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
- guest_stage2_force_pte_cb);
+ ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL);
guest_unlock_component(vm);
if (ret)
return ret;
@@ -266,7 +306,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
return 0;
}
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
{
struct hyp_page *page;
void *addr;
@@ -300,6 +340,8 @@ int __pkvm_prot_finalize(void)
params->vttbr = kvm_get_vttbr(mmu);
params->vtcr = mmu->vtcr;
params->hcr_el2 |= HCR_VM;
+ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+ params->hcr_el2 |= HCR_FWB;
/*
* The CMO below not only cleans the updated params to the
@@ -309,7 +351,7 @@ int __pkvm_prot_finalize(void)
*/
kvm_flush_dcache_to_poc(params, sizeof(*params));
- write_sysreg(params->hcr_el2, hcr_el2);
+ write_sysreg_hcr(params->hcr_el2);
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
/*
@@ -343,6 +385,19 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+/*
+ * Ensure the PFN range is contained within PA-range.
+ *
+ * This check is also robust to overflows and is therefore a requirement before
+ * using a pfn/nr_pages pair from an untrusted source.
+ */
+static bool pfn_range_is_valid(u64 pfn, u64 nr_pages)
+{
+ u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT);
+
+ return pfn < limit && ((limit - pfn) >= nr_pages);
+}
+
struct kvm_mem_range {
u64 start;
u64 end;
@@ -422,8 +477,15 @@ static bool range_is_memory(u64 start, u64 end)
static inline int __host_stage2_idmap(u64 start, u64 end,
enum kvm_pgtable_prot prot)
{
+ /*
+ * We don't make permission changes to the host idmap after
+ * initialisation, so we can squash -EAGAIN to save callers
+ * having to treat it like success in the case that they try to
+ * map something that is already mapped.
+ */
return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start,
- prot, &host_s2_pool, 0);
+ prot, &host_s2_pool,
+ KVM_PGTABLE_WALK_IGNORE_EAGAIN);
}
/*
@@ -455,6 +517,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
+ u64 granule;
s8 level;
int ret;
@@ -464,25 +527,29 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return ret;
if (kvm_pte_valid(pte))
- return -EAGAIN;
+ return -EEXIST;
if (pte) {
- WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE);
+ WARN_ON(addr_is_memory(addr) &&
+ get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE);
return -EPERM;
}
- do {
- u64 granule = kvm_granule_size(level);
+ for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) {
+ if (!kvm_level_supports_block_mapping(level))
+ continue;
+ granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- level++;
- } while ((level <= KVM_PGTABLE_LAST_LEVEL) &&
- !(kvm_level_supports_block_mapping(level) &&
- range_included(&cur, range)));
+ if (!range_included(&cur, range) && level < KVM_PGTABLE_LAST_LEVEL)
+ continue;
+ *range = cur;
+ return 0;
+ }
- *range = cur;
+ WARN_ON(1);
- return 0;
+ return -EINVAL;
}
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
@@ -493,30 +560,109 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
{
- phys_addr_t end = addr + size;
-
- for (; addr < end; addr += PAGE_SIZE)
- hyp_phys_to_page(addr)->host_state = state;
+ for_each_hyp_page(page, addr, size)
+ set_host_state(page, state);
}
-int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1)
+#define KVM_HOST_DONATION_PTE_EXTRA_MASK GENMASK(59, 4)
+static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size,
+ u8 owner_id, u64 meta)
{
+ kvm_pte_t annotation;
int ret;
- if (!addr_is_memory(addr))
+ if (owner_id == PKVM_ID_HOST)
+ return -EINVAL;
+
+ if (!range_is_memory(addr, addr + size))
return -EPERM;
- ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
- if (ret)
- return ret;
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id))
+ return -EINVAL;
- /* Don't forget to update the vmemmap tracking for the host */
- if (owner_id == PKVM_ID_HOST)
- __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
- else
+ if (!FIELD_FIT(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta))
+ return -EINVAL;
+
+ annotation = FIELD_PREP(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id) |
+ FIELD_PREP(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta);
+ ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt,
+ addr, size, &host_s2_pool,
+ KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation);
+ if (!ret) {
+ /*
+ * After stage2 maintenance has happened, but before the page
+ * owner has changed.
+ */
+ pkvm_sme_dvmsync_fw_call();
__host_update_page_state(addr, size, PKVM_NOPAGE);
+ }
+
+ return ret;
+}
+
+int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
+{
+ int ret = -EINVAL;
+
+ switch (owner_id) {
+ case PKVM_ID_HOST:
+ if (!range_is_memory(addr, addr + size))
+ return -EPERM;
+
+ ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
+ if (!ret)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ break;
+ case PKVM_ID_HYP:
+ ret = host_stage2_set_owner_metadata_locked(addr, size,
+ owner_id, 0);
+ break;
+ }
+
+ return ret;
+}
+
+#define KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK GENMASK(15, 0)
+/* We need 40 bits for the GFN to cover a 52-bit IPA with 4k pages and LPA2 */
+#define KVM_HOST_PTE_OWNER_GUEST_GFN_MASK GENMASK(55, 16)
+static u64 host_stage2_encode_gfn_meta(struct pkvm_hyp_vm *vm, u64 gfn)
+{
+ pkvm_handle_t handle = vm->kvm.arch.pkvm.handle;
+
+ BUILD_BUG_ON((pkvm_handle_t)-1 > KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK);
+ WARN_ON(!FIELD_FIT(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn));
+
+ return FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, handle) |
+ FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn);
+}
+
+static int host_stage2_decode_gfn_meta(kvm_pte_t pte, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ pkvm_handle_t handle;
+ u64 meta;
+
+ if (WARN_ON(kvm_pte_valid(pte)))
+ return -EINVAL;
+
+ if (FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) !=
+ KVM_HOST_INVALID_PTE_TYPE_DONATION) {
+ return -EINVAL;
+ }
+
+ if (FIELD_GET(KVM_HOST_DONATION_PTE_OWNER_MASK, pte) != PKVM_ID_GUEST)
+ return -EPERM;
+ meta = FIELD_GET(KVM_HOST_DONATION_PTE_EXTRA_MASK, pte);
+ handle = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, meta);
+ *vm = get_vm_by_handle(handle);
+ if (!*vm) {
+ /* We probably raced with teardown; try again */
+ return -EAGAIN;
+ }
+
+ *gfn = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, meta);
return 0;
}
@@ -563,11 +709,43 @@ unlock:
return ret;
}
+static void host_inject_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+ u64 ec, esr, spsr;
+
+ esr = read_sysreg_el2(SYS_ESR);
+ spsr = read_sysreg_el2(SYS_SPSR);
+
+ /* Repaint the ESR to report a same-level fault if taken from EL1 */
+ if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) {
+ ec = ESR_ELx_EC(esr);
+ if (ec == ESR_ELx_EC_DABT_LOW)
+ ec = ESR_ELx_EC_DABT_CUR;
+ else if (ec == ESR_ELx_EC_IABT_LOW)
+ ec = ESR_ELx_EC_IABT_CUR;
+ else
+ WARN_ON(1);
+ esr &= ~ESR_ELx_EC_MASK;
+ esr |= ec << ESR_ELx_EC_SHIFT;
+ }
+
+ /*
+ * Since S1PTW should only ever be set for stage-2 faults, we're pretty
+ * much guaranteed that it won't be set in ESR_EL1 by the hardware. So,
+ * let's use that bit to allow the host abort handler to differentiate
+ * this abort from normal userspace faults.
+ *
+ * Note: although S1PTW is RES0 at EL1, it is guaranteed by the
+ * architecture to be backed by flops, so it should be safe to use.
+ */
+ esr |= ESR_ELx_S1PTW;
+ inject_host_exception(esr);
+}
+
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
{
struct kvm_vcpu_fault_info fault;
u64 esr, addr;
- int ret = 0;
esr = read_sysreg_el2(SYS_ESR);
if (!__get_fault_info(esr, &fault)) {
@@ -578,9 +756,24 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
return;
}
- addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
- ret = host_stage2_idmap(addr);
- BUG_ON(ret && ret != -EAGAIN);
+
+ /*
+ * Yikes, we couldn't resolve the fault IPA. This should reinject an
+ * abort into the host when we figure out how to do that.
+ */
+ BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS));
+ addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12;
+
+ switch (host_stage2_idmap(addr)) {
+ case -EPERM:
+ host_inject_mem_abort(host_ctxt);
+ fallthrough;
+ case -EEXIST:
+ case 0:
+ break;
+ default:
+ BUG();
+ }
}
struct check_walk_data {
@@ -611,16 +804,16 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
static int __host_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- u64 end = addr + size;
int ret;
- ret = check_range_allowed_memory(addr, end);
+ ret = check_range_allowed_memory(addr, addr + size);
if (ret)
return ret;
hyp_assert_lock_held(&host_mmu.lock);
- for (; addr < end; addr += PAGE_SIZE) {
- if (hyp_phys_to_page(addr)->host_state != state)
+
+ for_each_hyp_page(page, addr, size) {
+ if (get_host_state(page) != state)
return -EPERM;
}
@@ -630,7 +823,7 @@ static int __host_check_page_state_range(u64 addr, u64 size,
static int __host_set_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) {
+ if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) {
int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
if (ret)
@@ -642,38 +835,45 @@ static int __host_set_page_state_range(u64 addr, u64 size,
return 0;
}
-static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr)
+static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
{
- if (!kvm_pte_valid(pte))
- return PKVM_NOPAGE;
+ for_each_hyp_page(page, phys, size)
+ set_hyp_state(page, state);
+}
- return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
+static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
+{
+ for_each_hyp_page(page, phys, size) {
+ if (get_hyp_state(page) != state)
+ return -EPERM;
+ }
+
+ return 0;
}
-static int __hyp_check_page_state_range(u64 addr, u64 size,
- enum pkvm_page_state state)
+static bool guest_pte_is_poisoned(kvm_pte_t pte)
{
- struct check_walk_data d = {
- .desired = state,
- .get_page_state = hyp_get_page_state,
- };
+ if (kvm_pte_valid(pte))
+ return false;
- hyp_assert_lock_held(&pkvm_pgd_lock);
- return check_page_state_range(&pkvm_pgtable, addr, size, &d);
+ return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) ==
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED;
}
static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
{
+ if (guest_pte_is_poisoned(pte))
+ return PKVM_POISON;
+
if (!kvm_pte_valid(pte))
return PKVM_NOPAGE;
return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
}
-static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr,
+static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr,
u64 size, enum pkvm_page_state state)
{
- struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
struct check_walk_data d = {
.desired = state,
.get_page_state = guest_get_page_state,
@@ -683,11 +883,80 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr,
return check_page_state_range(&vm->pgt, addr, size, &d);
}
+static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep, u64 *physp)
+{
+ kvm_pte_t pte;
+ u64 phys;
+ s8 level;
+ int ret;
+
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ return ret;
+ if (guest_pte_is_poisoned(pte))
+ return -EHWPOISON;
+ if (!kvm_pte_valid(pte))
+ return -ENOENT;
+ if (level != KVM_PGTABLE_LAST_LEVEL)
+ return -E2BIG;
+
+ phys = kvm_pte_to_phys(pte);
+ ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ if (WARN_ON(ret))
+ return ret;
+
+ *ptep = pte;
+ *physp = phys;
+
+ return 0;
+}
+
+int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+ kvm_pte_t pte;
+ s8 level;
+ u64 ipa;
+ int ret;
+
+ switch (kvm_vcpu_trap_get_class(&hyp_vcpu->vcpu)) {
+ case ESR_ELx_EC_DABT_LOW:
+ case ESR_ELx_EC_IABT_LOW:
+ if (kvm_vcpu_trap_is_translation_fault(&hyp_vcpu->vcpu))
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * The host has the faulting IPA when it calls us from the guest
+ * fault handler but we retrieve it ourselves from the FAR so as
+ * to avoid exposing an "oracle" that could reveal data access
+ * patterns of the guest after initial donation of its pages.
+ */
+ ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu);
+ ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu));
+
+ guest_lock_component(vm);
+ ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+ if (ret)
+ goto unlock;
+
+ if (level != KVM_PGTABLE_LAST_LEVEL) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = guest_pte_is_poisoned(pte);
+unlock:
+ guest_unlock_component(vm);
+ return ret;
+}
+
int __pkvm_host_share_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
- void *virt = __hyp_va(phys);
- enum kvm_pgtable_prot prot;
u64 size = PAGE_SIZE;
int ret;
@@ -697,14 +966,11 @@ int __pkvm_host_share_hyp(u64 pfn)
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (ret)
goto unlock;
- if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
- ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
- if (ret)
- goto unlock;
- }
+ ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
- prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
- WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
+ __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED));
unlock:
@@ -714,6 +980,72 @@ unlock:
return ret;
}
+int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE))
+ goto unlock;
+
+ ret = 0;
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+ WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 meta, phys, ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
+ if (ret)
+ goto unlock;
+
+ ret = -EPERM;
+ if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_SHARED_OWNED)
+ goto unlock;
+ if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED))
+ goto unlock;
+
+ ret = 0;
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
int __pkvm_host_unshare_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
@@ -727,7 +1059,7 @@ int __pkvm_host_unshare_hyp(u64 pfn)
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (ret)
goto unlock;
- ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED);
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
if (ret)
goto unlock;
if (hyp_page_count((void *)virt)) {
@@ -735,7 +1067,7 @@ int __pkvm_host_unshare_hyp(u64 pfn)
goto unlock;
}
- WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
+ __hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED));
unlock:
@@ -750,23 +1082,23 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
u64 phys = hyp_pfn_to_phys(pfn);
u64 size = PAGE_SIZE * nr_pages;
void *virt = __hyp_va(phys);
- enum kvm_pgtable_prot prot;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (ret)
goto unlock;
- if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
- ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
- if (ret)
- goto unlock;
- }
+ ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
- prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
- WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
+ __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP));
WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP));
unlock:
@@ -783,18 +1115,20 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
u64 virt = (u64)__hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
- ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED);
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+ ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
if (ret)
goto unlock;
- if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
- ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
- if (ret)
- goto unlock;
- }
+ __hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST));
@@ -809,24 +1143,30 @@ int hyp_pin_shared_mem(void *from, void *to)
{
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
u64 end = PAGE_ALIGN((u64)to);
+ u64 phys = __hyp_pa(start);
u64 size = end - start;
+ struct hyp_page *p;
int ret;
host_lock_component();
hyp_lock_component();
- ret = __host_check_page_state_range(__hyp_pa(start), size,
- PKVM_PAGE_SHARED_OWNED);
+ ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (ret)
goto unlock;
- ret = __hyp_check_page_state_range(start, size,
- PKVM_PAGE_SHARED_BORROWED);
+ ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
if (ret)
goto unlock;
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_inc(hyp_virt_to_page(cur));
+ for (cur = start; cur < end; cur += PAGE_SIZE) {
+ p = hyp_virt_to_page(cur);
+ hyp_page_ref_inc(p);
+ if (p->refcount == 1)
+ WARN_ON(pkvm_create_mappings_locked((void *)cur,
+ (void *)cur + PAGE_SIZE,
+ PAGE_HYP));
+ }
unlock:
hyp_unlock_component();
@@ -839,12 +1179,17 @@ void hyp_unpin_shared_mem(void *from, void *to)
{
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
u64 end = PAGE_ALIGN((u64)to);
+ struct hyp_page *p;
host_lock_component();
hyp_lock_component();
- for (cur = start; cur < end; cur += PAGE_SIZE)
- hyp_page_ref_dec(hyp_virt_to_page(cur));
+ for (cur = start; cur < end; cur += PAGE_SIZE) {
+ p = hyp_virt_to_page(cur);
+ if (p->refcount == 1)
+ WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE);
+ hyp_page_ref_dec(p);
+ }
hyp_unlock_component();
host_unlock_component();
@@ -856,6 +1201,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (!ret)
@@ -871,6 +1219,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (!ret)
@@ -880,49 +1231,281 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
return ret;
}
-int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu,
- enum kvm_pgtable_prot prot)
+static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size)
{
- struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
- u64 phys = hyp_pfn_to_phys(pfn);
- u64 ipa = hyp_pfn_to_phys(gfn);
- struct hyp_page *page;
- int ret;
+ size_t block_size;
- if (prot & ~KVM_PGTABLE_PROT_RWX)
+ if (nr_pages == 1) {
+ *size = PAGE_SIZE;
+ return 0;
+ }
+
+ /* We solely support second to last level huge mapping */
+ block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1);
+
+ if (nr_pages != block_size >> PAGE_SHIFT)
return -EINVAL;
- ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ if (!IS_ALIGNED(phys | ipa, block_size))
+ return -EINVAL;
+
+ *size = block_size;
+ return 0;
+}
+
+static void hyp_poison_page(phys_addr_t phys)
+{
+ void *addr = hyp_fixmap_map(phys);
+
+ memset(addr, 0, PAGE_SIZE);
+ /*
+ * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page()
+ * here as the latter may elide the CMO under the assumption that FWB
+ * will be enabled on CPUs that support it. This is incorrect for the
+ * host stage-2 and would otherwise lead to a malicious host potentially
+ * being able to read the contents of newly reclaimed guest pages.
+ */
+ kvm_flush_dcache_to_poc(addr, PAGE_SIZE);
+ hyp_fixmap_unmap();
+}
+
+static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm,
+ u64 *gfn)
+{
+ enum pkvm_page_state state;
+ kvm_pte_t pte;
+ s8 level;
+ int ret;
+
+ if (!addr_is_memory(phys))
+ return -EFAULT;
+
+ state = get_host_state(hyp_phys_to_page(phys));
+ switch (state) {
+ case PKVM_PAGE_OWNED:
+ case PKVM_PAGE_SHARED_OWNED:
+ case PKVM_PAGE_SHARED_BORROWED:
+ /* The access should no longer fault; try again. */
+ return -EAGAIN;
+ case PKVM_NOPAGE:
+ break;
+ default:
+ return -EPERM;
+ }
+
+ ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level);
if (ret)
return ret;
+ if (WARN_ON(level != KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ return host_stage2_decode_gfn_meta(pte, vm, gfn);
+}
+
+int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys)
+{
+ struct pkvm_hyp_vm *vm;
+ u64 gfn, ipa, pa;
+ kvm_pte_t pte;
+ int ret;
+
+ phys &= PAGE_MASK;
+
+ hyp_spin_lock(&vm_table_lock);
host_lock_component();
+
+ ret = host_stage2_get_guest_info(phys, &vm, &gfn);
+ if (ret)
+ goto unlock_host;
+
+ ipa = hyp_pfn_to_phys(gfn);
guest_lock_component(vm);
+ ret = get_valid_guest_pte(vm, ipa, &pte, &pa);
+ if (ret)
+ goto unlock_guest;
- ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE);
+ WARN_ON(pa != phys);
+ if (guest_get_page_state(pte, ipa) != PKVM_PAGE_OWNED) {
+ ret = -EPERM;
+ goto unlock_guest;
+ }
+
+ /* We really shouldn't be allocating, so don't pass a memcache */
+ ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, NULL,
+ KVM_GUEST_INVALID_PTE_TYPE_POISONED,
+ 0);
+ if (ret)
+ goto unlock_guest;
+
+ hyp_poison_page(phys);
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+unlock_guest:
+ guest_unlock_component(vm);
+unlock_host:
+ host_unlock_component();
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm)
+{
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ kvm_pte_t pte;
+ u64 phys;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = get_valid_guest_pte(vm, ipa, &pte, &phys);
if (ret)
goto unlock;
- page = hyp_phys_to_page(phys);
- switch (page->host_state) {
+ switch (guest_get_page_state(pte, ipa)) {
case PKVM_PAGE_OWNED:
- WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED));
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE));
+ hyp_poison_page(phys);
break;
case PKVM_PAGE_SHARED_OWNED:
- if (page->host_share_guest_count)
- break;
- /* Only host to np-guest multi-sharing is tolerated */
- WARN_ON(1);
- fallthrough;
+ WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+ break;
default:
ret = -EPERM;
goto unlock;
}
+ WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE));
+ WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ /*
+ * -EHWPOISON implies that the page was forcefully reclaimed already
+ * so return success for the GUP pin to be dropped.
+ */
+ return ret && ret != -EHWPOISON ? ret : 0;
+}
+
+/*
+ * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one
+ * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages()
+ * bounds the worst-case allocation: exact for the PAGE_SIZE leaf,
+ * conservative by one for the block.
+ */
+static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+
+ if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu))
+ return -ENOMEM;
+
+ return 0;
+}
+
+int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 meta;
+ int ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = __host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED);
+ if (ret)
+ goto unlock;
+
+ ret = __guest_check_page_state_range(vm, ipa, PAGE_SIZE, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+
+ ret = __guest_check_pgtable_memcache(vcpu);
+ if (ret)
+ goto unlock;
+
+ meta = host_stage2_encode_gfn_meta(vm, gfn);
+ WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE,
+ PKVM_ID_GUEST, meta));
WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
+ pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED),
+ &vcpu->vcpu.arch.pkvm_memcache, 0));
+
+unlock:
+ guest_unlock_component(vm);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
+ enum kvm_pgtable_prot prot)
+{
+ struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+ u64 phys = hyp_pfn_to_phys(pfn);
+ u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 size;
+ int ret;
+
+ if (prot & ~KVM_PGTABLE_PROT_RWX)
+ return -EINVAL;
+
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
+ ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
+
+ ret = check_range_allowed_memory(phys, phys + size);
+ if (ret)
+ return ret;
+
+ host_lock_component();
+ guest_lock_component(vm);
+
+ ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE);
+ if (ret)
+ goto unlock;
+
+ for_each_hyp_page(page, phys, size) {
+ switch (get_host_state(page)) {
+ case PKVM_PAGE_OWNED:
+ continue;
+ case PKVM_PAGE_SHARED_OWNED:
+ if (page->host_share_guest_count == U32_MAX) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ /* Only host to np-guest multi-sharing is tolerated */
+ if (page->host_share_guest_count)
+ continue;
+
+ fallthrough;
+ default:
+ ret = -EPERM;
+ goto unlock;
+ }
+ }
+
+ ret = __guest_check_pgtable_memcache(vcpu);
+ if (ret)
+ goto unlock;
+
+ for_each_hyp_page(page, phys, size) {
+ set_host_state(page, PKVM_PAGE_SHARED_OWNED);
+ page->host_share_guest_count++;
+ }
+
+ WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys,
pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED),
&vcpu->vcpu.arch.pkvm_memcache, 0));
- page->host_share_guest_count++;
unlock:
guest_unlock_component(vm);
@@ -931,10 +1514,9 @@ unlock:
return ret;
}
-static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa)
+static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size)
{
enum pkvm_page_state state;
- struct hyp_page *page;
kvm_pte_t pte;
u64 phys;
s8 level;
@@ -945,51 +1527,60 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip
return ret;
if (!kvm_pte_valid(pte))
return -ENOENT;
- if (level != KVM_PGTABLE_LAST_LEVEL)
+ if (size && kvm_granule_size(level) != size)
return -E2BIG;
+ if (!size)
+ size = kvm_granule_size(level);
+
state = guest_get_page_state(pte, ipa);
if (state != PKVM_PAGE_SHARED_BORROWED)
return -EPERM;
phys = kvm_pte_to_phys(pte);
- ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
+ ret = check_range_allowed_memory(phys, phys + size);
if (WARN_ON(ret))
return ret;
- page = hyp_phys_to_page(phys);
- if (page->host_state != PKVM_PAGE_SHARED_OWNED)
- return -EPERM;
- if (WARN_ON(!page->host_share_guest_count))
- return -EINVAL;
+ for_each_hyp_page(page, phys, size) {
+ if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED)
+ return -EPERM;
+ if (WARN_ON(!page->host_share_guest_count))
+ return -EINVAL;
+ }
*__phys = phys;
return 0;
}
-int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm)
+int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
u64 ipa = hyp_pfn_to_phys(gfn);
- struct hyp_page *page;
- u64 phys;
+ u64 size, phys;
int ret;
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
+
host_lock_component();
guest_lock_component(vm);
- ret = __check_host_shared_guest(vm, &phys, ipa);
+ ret = __check_host_shared_guest(vm, &phys, ipa, size);
if (ret)
goto unlock;
- ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
+ ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size);
if (ret)
goto unlock;
- page = hyp_phys_to_page(phys);
- page->host_share_guest_count--;
- if (!page->host_share_guest_count)
- WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED));
+ for_each_hyp_page(page, phys, size) {
+ /* __check_host_shared_guest() protects against underflow */
+ page->host_share_guest_count--;
+ if (!page->host_share_guest_count)
+ set_host_state(page, PKVM_PAGE_OWNED);
+ }
unlock:
guest_unlock_component(vm);
@@ -998,7 +1589,7 @@ unlock:
return ret;
}
-static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa)
+static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size)
{
u64 phys;
int ret;
@@ -1009,7 +1600,7 @@ static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa)
host_lock_component();
guest_lock_component(vm);
- ret = __check_host_shared_guest(vm, &phys, ipa);
+ ret = __check_host_shared_guest(vm, &phys, ipa, size);
guest_unlock_component(vm);
host_unlock_component();
@@ -1029,7 +1620,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
- assert_host_shared_guest(vm, ipa);
+ assert_host_shared_guest(vm, ipa, 0);
guest_lock_component(vm);
ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0);
guest_unlock_component(vm);
@@ -1037,33 +1628,41 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_
return ret;
}
-int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm)
+int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
- u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
- assert_host_shared_guest(vm, ipa);
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
+
+ assert_host_shared_guest(vm, ipa, size);
guest_lock_component(vm);
- ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE);
+ ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size);
guest_unlock_component(vm);
return ret;
}
-int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm)
+int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm)
{
- u64 ipa = hyp_pfn_to_phys(gfn);
+ u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
- assert_host_shared_guest(vm, ipa);
+ ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
+ if (ret)
+ return ret;
+
+ assert_host_shared_guest(vm, ipa, size);
guest_lock_component(vm);
- ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold);
+ ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold);
guest_unlock_component(vm);
return ret;
@@ -1077,10 +1676,241 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu)
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
- assert_host_shared_guest(vm, ipa);
+ assert_host_shared_guest(vm, ipa, 0);
guest_lock_component(vm);
kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0);
guest_unlock_component(vm);
return 0;
}
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+struct pkvm_expected_state {
+ enum pkvm_page_state host;
+ enum pkvm_page_state hyp;
+ enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */
+};
+
+static struct pkvm_expected_state selftest_state;
+static struct hyp_page *selftest_page;
+static struct pkvm_hyp_vcpu *selftest_vcpu;
+
+static u64 selftest_ipa(void)
+{
+ return BIT(selftest_vcpu->vcpu.arch.hw_mmu->pgt->ia_bits - 1);
+}
+
+static void assert_page_state(void)
+{
+ void *virt = hyp_page_to_virt(selftest_page);
+ u64 size = PAGE_SIZE << selftest_page->order;
+ struct pkvm_hyp_vcpu *vcpu = selftest_vcpu;
+ u64 phys = hyp_virt_to_phys(virt);
+ u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE };
+ struct pkvm_hyp_vm *vm;
+
+ vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+
+ host_lock_component();
+ WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host));
+ host_unlock_component();
+
+ hyp_lock_component();
+ WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp));
+ hyp_unlock_component();
+
+ guest_lock_component(vm);
+ WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0]));
+ WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1]));
+ guest_unlock_component(vm);
+}
+
+#define assert_transition_res(res, fn, ...) \
+ do { \
+ WARN_ON(fn(__VA_ARGS__) != res); \
+ assert_page_state(); \
+ } while (0)
+
+void pkvm_ownership_selftest(void *base)
+{
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
+ void *virt = hyp_alloc_pages(&host_s2_pool, 0);
+ struct pkvm_hyp_vcpu *vcpu;
+ u64 phys, size, pfn, gfn;
+ struct pkvm_hyp_vm *vm;
+
+ WARN_ON(!virt);
+ selftest_page = hyp_virt_to_page(virt);
+ selftest_page->refcount = 0;
+ selftest_vcpu = vcpu = init_selftest_vm(base);
+ vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+
+ size = PAGE_SIZE << selftest_page->order;
+ phys = hyp_virt_to_phys(virt);
+ pfn = hyp_phys_to_pfn(phys);
+ gfn = hyp_phys_to_pfn(selftest_ipa());
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.hyp = PKVM_PAGE_OWNED;
+ selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE;
+ assert_page_state();
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+
+ assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
+ assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
+ hyp_unpin_shared_mem(virt, virt + size);
+ WARN_ON(hyp_page_count(virt) != 1);
+ assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+
+ hyp_unpin_shared_mem(virt, virt + size);
+ assert_page_state();
+ WARN_ON(hyp_page_count(virt));
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_hyp, pfn);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.hyp = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_SHARED_OWNED;
+ selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
+
+ selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED;
+ assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2);
+
+ selftest_state.guest[0] = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm);
+
+ selftest_state.guest[1] = PKVM_NOPAGE;
+ selftest_state.host = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_SHARED_BORROWED;
+ selftest_state.guest[0] = PKVM_PAGE_SHARED_OWNED;
+ assert_transition_res(0, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[0] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_guest_unshare_host, vcpu, gfn);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
+ assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
+ assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
+ assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[0] = PKVM_POISON;
+ assert_transition_res(0, __pkvm_host_force_reclaim_page_guest, phys);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_share_host, vcpu, gfn);
+ assert_transition_res(-EHWPOISON, __pkvm_guest_unshare_host, vcpu, gfn);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.guest[1] = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu);
+
+ selftest_state.host = PKVM_PAGE_OWNED;
+ selftest_state.guest[1] = PKVM_NOPAGE;
+ assert_transition_res(0, __pkvm_host_reclaim_page_guest, gfn + 1, vm);
+ assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu);
+ assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
+
+ selftest_state.host = PKVM_NOPAGE;
+ selftest_state.hyp = PKVM_PAGE_OWNED;
+ assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1);
+
+ teardown_selftest_vm();
+ selftest_page->refcount = 1;
+ hyp_put_page(&host_s2_pool, virt);
+}
+#endif
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index f41c7440b34b..3b0bee496bff 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -229,9 +229,8 @@ int hyp_map_vectors(void)
return 0;
}
-void *hyp_fixmap_map(phys_addr_t phys)
+static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
{
- struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
kvm_pte_t pte, *ptep = slot->ptep;
pte = *ptep;
@@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys)
return (void *)slot->addr;
}
+void *hyp_fixmap_map(phys_addr_t phys)
+{
+ return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys) + offset_in_page(phys);
+}
+
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
kvm_pte_t *ptep = slot->ptep;
u64 addr = slot->addr;
+ u32 level;
+
+ if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE)
+ level = KVM_PGTABLE_LAST_LEVEL;
+ else
+ level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
@@ -260,8 +270,8 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
- __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL);
- dsb(ish);
+ __tlbi_level(vale2is, addr, level);
+ __tlbi_sync_s1ish_hyp();
isb();
}
@@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void)
static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
- struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
+ struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg;
- if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL)
+ if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level))
return -EINVAL;
slot->addr = ctx->addr;
@@ -296,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu)
struct kvm_pgtable_walker walker = {
.cb = __create_fixmap_slot_cb,
.flags = KVM_PGTABLE_WALK_LEAF,
- .arg = (void *)cpu,
+ .arg = per_cpu_ptr(&fixmap_slots, cpu),
};
return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
}
-int hyp_create_pcpu_fixmap(void)
+#if PAGE_SHIFT < 16
+#define HAS_FIXBLOCK
+static struct hyp_fixmap_slot hyp_fixblock_slot;
+static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock);
+#endif
+
+static int create_fixblock(void)
+{
+#ifdef HAS_FIXBLOCK
+ struct kvm_pgtable_walker walker = {
+ .cb = __create_fixmap_slot_cb,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &hyp_fixblock_slot,
+ };
+ unsigned long addr;
+ phys_addr_t phys;
+ int ret, i;
+
+ /* Find a RAM phys address, PMD aligned */
+ for (i = 0; i < hyp_memblock_nr; i++) {
+ phys = ALIGN(hyp_memory[i].base, PMD_SIZE);
+ if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size))
+ break;
+ }
+
+ if (i >= hyp_memblock_nr)
+ return -EINVAL;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ addr = ALIGN(__io_map_base, PMD_SIZE);
+ ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE);
+ if (ret)
+ goto unlock;
+
+ ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP);
+ if (ret)
+ goto unlock;
+
+ ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker);
+
+unlock:
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+void *hyp_fixblock_map(phys_addr_t phys, size_t *size)
+{
+#ifdef HAS_FIXBLOCK
+ *size = PMD_SIZE;
+ hyp_spin_lock(&hyp_fixblock_lock);
+ return fixmap_map_slot(&hyp_fixblock_slot, phys) + offset_in_page(phys);
+#else
+ *size = PAGE_SIZE;
+ return hyp_fixmap_map(phys);
+#endif
+}
+
+void hyp_fixblock_unmap(void)
+{
+#ifdef HAS_FIXBLOCK
+ fixmap_clear_slot(&hyp_fixblock_slot);
+ hyp_spin_unlock(&hyp_fixblock_lock);
+#else
+ hyp_fixmap_unmap();
+#endif
+}
+
+int hyp_create_fixmap(void)
{
unsigned long addr, i;
int ret;
@@ -322,7 +403,7 @@ int hyp_create_pcpu_fixmap(void)
return ret;
}
- return 0;
+ return create_fixblock();
}
int hyp_create_idmap(u32 hyp_va_bits)
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 3927fe52a3dd..eb1c10120f9f 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -4,6 +4,8 @@
* Author: Fuad Tabba <tabba@google.com>
*/
+#include <kvm/arm_hypercalls.h>
+
#include <linux/kvm_host.h>
#include <linux/mm.h>
@@ -23,8 +25,8 @@ unsigned int kvm_arm_vmid_bits;
unsigned int kvm_host_sve_max_vl;
/*
- * The currently loaded hyp vCPU for each physical CPU. Used only when
- * protected KVM is enabled, but for both protected and non-protected VMs.
+ * The currently loaded hyp vCPU for each physical CPU. Used in protected mode
+ * for both protected and non-protected VMs.
*/
static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu);
@@ -46,7 +48,8 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr_el2 |= HCR_FWB;
if (cpus_have_final_cap(ARM64_HAS_EVT) &&
- !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
+ kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0))
vcpu->arch.hcr_el2 |= HCR_TID4;
else
vcpu->arch.hcr_el2 |= HCR_TID2;
@@ -81,7 +84,7 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu)
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
val &= ~(HCR_AMVOFFEN);
- if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) {
+ if (!kvm_has_mte(kvm)) {
val |= HCR_TID5;
val &= ~(HCR_DCT | HCR_ATA);
}
@@ -116,8 +119,8 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu)
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
val |= MDCR_EL2_TTRF;
- if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP))
- val |= MDCR_EL2_E2TB_MASK;
+ if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
+ val &= ~MDCR_EL2_E2TB_MASK;
/* Trap Debug Communications Channel registers */
if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
@@ -134,7 +137,7 @@ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- /* Protected KVM does not support AArch32 guests. */
+ /* No AArch32 support for protected guests. */
if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) ||
kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32))
return -EINVAL;
@@ -166,8 +169,13 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
pkvm_vcpu_reset_hcr(vcpu);
- if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu)))
+ if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) {
+ struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
+
+ /* Trust the host for non-protected vcpu features. */
+ vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2;
return 0;
+ }
ret = pkvm_check_pvm_cpu_features(vcpu);
if (ret)
@@ -175,6 +183,7 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
pvm_init_traps_hcr(vcpu);
pvm_init_traps_mdcr(vcpu);
+ vcpu_set_hcrx(vcpu);
return 0;
}
@@ -185,6 +194,11 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu)
*/
#define HANDLE_OFFSET 0x1000
+/*
+ * Marks a reserved but not yet used entry in the VM table.
+ */
+#define RESERVED_ENTRY ((void *)0xa110ca7ed)
+
static unsigned int vm_handle_to_idx(pkvm_handle_t handle)
{
return handle - HANDLE_OFFSET;
@@ -203,13 +217,14 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx)
DEFINE_HYP_SPINLOCK(vm_table_lock);
/*
- * The table of VM entries for protected VMs in hyp.
- * Allocated at hyp initialization and setup.
+ * A table that tracks all VMs in protected mode.
+ * Allocated during hyp initialization and setup.
*/
static struct pkvm_hyp_vm **vm_table;
void pkvm_hyp_vm_table_init(void *tbl)
{
+ BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1);
WARN_ON(vm_table);
vm_table = tbl;
}
@@ -217,13 +232,19 @@ void pkvm_hyp_vm_table_init(void *tbl)
/*
* Return the hyp vm structure corresponding to the handle.
*/
-static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
+struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
{
unsigned int idx = vm_handle_to_idx(handle);
+ hyp_assert_lock_held(&vm_table_lock);
+
if (unlikely(idx >= KVM_MAX_PVMS))
return NULL;
+ /* A reserved entry doesn't represent an initialized VM. */
+ if (unlikely(vm_table[idx] == RESERVED_ENTRY))
+ return NULL;
+
return vm_table[idx];
}
@@ -239,10 +260,16 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
hyp_spin_lock(&vm_table_lock);
hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying)
+ goto unlock;
+
+ if (hyp_vm->kvm.created_vcpus <= vcpu_idx)
goto unlock;
- hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
+ /* Pairs with smp_store_release() in register_hyp_vcpu(). */
+ hyp_vcpu = smp_load_acquire(&hyp_vm->vcpus[vcpu_idx]);
+ if (!hyp_vcpu)
+ goto unlock;
/* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) {
@@ -315,33 +342,44 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc
unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags);
DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
- if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
- set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
+ /* CTR_EL0 is always under host control, even for protected VMs. */
+ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+
+ /* Preserve the vgic model so that GICv3 emulation works */
+ hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model;
/* No restrictions for non-protected VMs. */
if (!kvm_vm_is_protected(kvm)) {
hyp_vm->kvm.arch.flags = host_arch_flags;
+ hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED);
bitmap_copy(kvm->arch.vcpu_features,
host_kvm->arch.vcpu_features,
KVM_VCPU_MAX_FEATURES);
+
+ if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags))
+ hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1;
+
return;
}
+ if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_MTE))
+ kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_MTE_ENABLED);
+
bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
- if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3))
+ if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PMU_V3))
set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
- if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS))
+ if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_ADDRESS))
set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
- if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC))
+ if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_PTRAUTH_GENERIC))
set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
- if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) {
+ if (kvm_pkvm_ext_allowed(kvm, KVM_CAP_ARM_SVE)) {
set_bit(KVM_ARM_VCPU_SVE, allowed_features);
kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE);
}
@@ -356,74 +394,157 @@ static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
}
+static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ void *sve_state;
+
+ if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE))
+ return;
+
+ sve_state = hyp_vcpu->vcpu.arch.sve_state;
+ hyp_unpin_shared_mem(sve_state,
+ sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu));
+}
+
static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
unsigned int nr_vcpus)
{
int i;
- for (i = 0; i < nr_vcpus; i++)
- unpin_host_vcpu(hyp_vcpus[i]->host_vcpu);
+ for (i = 0; i < nr_vcpus; i++) {
+ struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i];
+
+ if (!hyp_vcpu)
+ continue;
+
+ unpin_host_vcpu(hyp_vcpu->host_vcpu);
+ unpin_host_sve_state(hyp_vcpu);
+ }
}
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
- unsigned int nr_vcpus)
+ unsigned int nr_vcpus, pkvm_handle_t handle)
{
+ struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
+ int idx = vm_handle_to_idx(handle);
+
+ hyp_vm->kvm.arch.pkvm.handle = handle;
+
hyp_vm->host_kvm = host_kvm;
hyp_vm->kvm.created_vcpus = nr_vcpus;
- hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
- hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled);
+ hyp_vm->kvm.arch.pkvm.is_protected = READ_ONCE(host_kvm->arch.pkvm.is_protected);
+ hyp_vm->kvm.arch.pkvm.is_created = true;
hyp_vm->kvm.arch.flags = 0;
pkvm_init_features_from_host(hyp_vm, host_kvm);
+
+ /* VMID 0 is reserved for the host */
+ atomic64_set(&mmu->vmid.id, idx + 1);
+
+ mmu->vtcr = host_mmu.arch.mmu.vtcr;
+ mmu->arch = &hyp_vm->kvm.arch;
+ mmu->pgt = &hyp_vm->pgt;
}
-static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
+static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
{
struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
+ unsigned int sve_max_vl;
+ size_t sve_state_size;
+ void *sve_state;
+ int ret = 0;
- if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE))
+ if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) {
vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED);
+ return 0;
+ }
+
+ /* Limit guest vector length to the maximum supported by the host. */
+ sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl);
+ sve_state_size = sve_state_size_from_vl(sve_max_vl);
+ sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state));
+
+ if (!sve_state || !sve_state_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size);
+ if (ret)
+ goto err;
+
+ vcpu->arch.sve_state = sve_state;
+ vcpu->arch.sve_max_vl = sve_max_vl;
+
+ return 0;
+err:
+ clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features);
+ return ret;
+}
+
+static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
+ const struct kvm *host_kvm = hyp_vm->host_kvm;
+ struct kvm *kvm = &hyp_vm->kvm;
+
+ if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags))
+ return -EINVAL;
+
+ if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return 0;
+
+ memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs));
+
+ return 0;
+}
+
+static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ int ret = 0;
+
+ if (pkvm_hyp_vcpu_is_protected(hyp_vcpu))
+ kvm_init_pvm_id_regs(&hyp_vcpu->vcpu);
+ else
+ ret = vm_copy_id_regs(hyp_vcpu);
+
+ return ret;
}
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
struct pkvm_hyp_vm *hyp_vm,
- struct kvm_vcpu *host_vcpu,
- unsigned int vcpu_idx)
+ struct kvm_vcpu *host_vcpu)
{
int ret = 0;
if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
return -EBUSY;
- if (host_vcpu->vcpu_idx != vcpu_idx) {
- ret = -EINVAL;
- goto done;
- }
-
hyp_vcpu->host_vcpu = host_vcpu;
hyp_vcpu->vcpu.kvm = &hyp_vm->kvm;
hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id);
- hyp_vcpu->vcpu.vcpu_idx = vcpu_idx;
+ hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx);
hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu;
hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags);
hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
- if (pkvm_hyp_vcpu_is_protected(hyp_vcpu))
- kvm_init_pvm_id_regs(&hyp_vcpu->vcpu);
+ ret = pkvm_vcpu_init_sysregs(hyp_vcpu);
+ if (ret)
+ goto done;
ret = pkvm_vcpu_init_traps(hyp_vcpu);
if (ret)
goto done;
- pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
+ ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
done:
if (ret)
unpin_host_vcpu(host_vcpu);
return ret;
}
-static int find_free_vm_table_entry(struct kvm *host_kvm)
+static int find_free_vm_table_entry(void)
{
int i;
@@ -436,15 +557,13 @@ static int find_free_vm_table_entry(struct kvm *host_kvm)
}
/*
- * Allocate a VM table entry and insert a pointer to the new vm.
+ * Reserve a VM table entry.
*
- * Return a unique handle to the protected VM on success,
+ * Return a unique handle to the VM on success,
* negative error code on failure.
*/
-static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
- struct pkvm_hyp_vm *hyp_vm)
+static int allocate_vm_table_entry(void)
{
- struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu;
int idx;
hyp_assert_lock_held(&vm_table_lock);
@@ -457,20 +576,57 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm,
if (unlikely(!vm_table))
return -EINVAL;
- idx = find_free_vm_table_entry(host_kvm);
- if (idx < 0)
+ idx = find_free_vm_table_entry();
+ if (unlikely(idx < 0))
return idx;
- hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx);
+ vm_table[idx] = RESERVED_ENTRY;
- /* VMID 0 is reserved for the host */
- atomic64_set(&mmu->vmid.id, idx + 1);
+ return idx;
+}
- mmu->arch = &hyp_vm->kvm.arch;
- mmu->pgt = &hyp_vm->pgt;
+static int __insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ unsigned int idx;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ /*
+ * Initializing protected state might have failed, yet a malicious
+ * host could trigger this function. Thus, ensure that 'vm_table'
+ * exists.
+ */
+ if (unlikely(!vm_table))
+ return -EINVAL;
+
+ idx = vm_handle_to_idx(handle);
+ if (unlikely(idx >= KVM_MAX_PVMS))
+ return -EINVAL;
+
+ if (unlikely(vm_table[idx] != RESERVED_ENTRY))
+ return -EINVAL;
vm_table[idx] = hyp_vm;
- return hyp_vm->kvm.arch.pkvm.handle;
+
+ return 0;
+}
+
+/*
+ * Insert a pointer to the initialized VM into the VM table.
+ *
+ * Return 0 on success, or negative error code on failure.
+ */
+static int insert_vm_table_entry(pkvm_handle_t handle,
+ struct pkvm_hyp_vm *hyp_vm)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = __insert_vm_table_entry(handle, hyp_vm);
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
}
/*
@@ -537,10 +693,108 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
}
/*
- * Initialize the hypervisor copy of the protected VM state using the
- * memory donated by the host.
+ * Reserves an entry in the hypervisor for a new VM in protected mode.
+ *
+ * Return a unique handle to the VM on success, negative error code on failure.
+ */
+int __pkvm_reserve_vm(void)
+{
+ int ret;
+
+ hyp_spin_lock(&vm_table_lock);
+ ret = allocate_vm_table_entry();
+ hyp_spin_unlock(&vm_table_lock);
+
+ if (ret < 0)
+ return ret;
+
+ return idx_to_vm_handle(ret);
+}
+
+/*
+ * Removes a reserved entry, but only if is hasn't been used yet.
+ * Otherwise, the VM needs to be destroyed.
+ */
+void __pkvm_unreserve_vm(pkvm_handle_t handle)
+{
+ unsigned int idx = vm_handle_to_idx(handle);
+
+ if (unlikely(!vm_table))
+ return;
+
+ hyp_spin_lock(&vm_table_lock);
+ if (likely(idx < KVM_MAX_PVMS && vm_table[idx] == RESERVED_ENTRY))
+ remove_vm_table_entry(handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+
+#ifdef CONFIG_NVHE_EL2_DEBUG
+static struct pkvm_hyp_vm selftest_vm = {
+ .kvm = {
+ .arch = {
+ .mmu = {
+ .arch = &selftest_vm.kvm.arch,
+ .pgt = &selftest_vm.pgt,
+ },
+ },
+ },
+};
+
+static struct pkvm_hyp_vcpu selftest_vcpu = {
+ .vcpu = {
+ .arch = {
+ .hw_mmu = &selftest_vm.kvm.arch.mmu,
+ },
+ .kvm = &selftest_vm.kvm,
+ },
+};
+
+struct pkvm_hyp_vcpu *init_selftest_vm(void *virt)
+{
+ struct hyp_page *p = hyp_virt_to_page(virt);
+ unsigned long min_pages, seeded = 0;
+ int i;
+
+ selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
+ WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
+
+ /*
+ * Mirror pkvm_refill_memcache() for the share/donate pre-checks;
+ * the selftest invokes those functions directly and would
+ * otherwise see an empty memcache.
+ */
+ min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu);
+
+ for (i = 0; i < pkvm_selftest_pages(); i++) {
+ if (p[i].refcount)
+ continue;
+ p[i].refcount = 1;
+ if (seeded < min_pages) {
+ push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache,
+ hyp_page_to_virt(&p[i]), hyp_virt_to_phys);
+ seeded++;
+ } else {
+ hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
+ }
+ }
+
+ selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm();
+ insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm);
+ return &selftest_vcpu;
+}
+
+void teardown_selftest_vm(void)
+{
+ hyp_spin_lock(&vm_table_lock);
+ remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle);
+ hyp_spin_unlock(&vm_table_lock);
+}
+#endif /* CONFIG_NVHE_EL2_DEBUG */
+
+/*
+ * Initialize the hypervisor copy of the VM state using host-donated memory.
*
- * Unmaps the donated memory from the host at stage 2.
+ * Unmap the donated memory from the host at stage 2.
*
* host_kvm: A pointer to the host's struct kvm.
* vm_hva: The host va of the area being donated for the VM state.
@@ -549,8 +803,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size)
* the VM. Must be page aligned. Its size is implied by the VM's
* VTCR.
*
- * Return a unique handle to the protected VM on success,
- * negative error code on failure.
+ * Return 0 success, negative error code on failure.
*/
int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
unsigned long pgd_hva)
@@ -558,6 +811,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
struct pkvm_hyp_vm *hyp_vm = NULL;
size_t vm_size, pgd_size;
unsigned int nr_vcpus;
+ pkvm_handle_t handle;
void *pgd = NULL;
int ret;
@@ -571,6 +825,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
goto err_unpin_kvm;
}
+ handle = READ_ONCE(host_kvm->arch.pkvm.handle);
+ if (unlikely(handle < HANDLE_OFFSET)) {
+ ret = -EINVAL;
+ goto err_unpin_kvm;
+ }
+
vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
@@ -584,24 +844,19 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
if (!pgd)
goto err_remove_mappings;
- init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus);
-
- hyp_spin_lock(&vm_table_lock);
- ret = insert_vm_table_entry(host_kvm, hyp_vm);
- if (ret < 0)
- goto err_unlock;
+ init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus, handle);
ret = kvm_guest_prepare_stage2(hyp_vm, pgd);
if (ret)
- goto err_remove_vm_table_entry;
- hyp_spin_unlock(&vm_table_lock);
+ goto err_remove_mappings;
- return hyp_vm->kvm.arch.pkvm.handle;
+ /* Must be called last since this publishes the VM. */
+ ret = insert_vm_table_entry(handle, hyp_vm);
+ if (ret)
+ goto err_remove_mappings;
+
+ return 0;
-err_remove_vm_table_entry:
- remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle);
-err_unlock:
- hyp_spin_unlock(&vm_table_lock);
err_remove_mappings:
unmap_donated_memory(hyp_vm, vm_size);
unmap_donated_memory(pgd, pgd_size);
@@ -611,22 +866,39 @@ err_unpin_kvm:
}
/*
- * Initialize the hypervisor copy of the protected vCPU state using the
- * memory donated by the host.
+ * Initialize the hypervisor copy of the vCPU state using host-donated memory.
*
- * handle: The handle for the protected vm.
+ * handle: The hypervisor handle for the vm.
* host_vcpu: A pointer to the corresponding host vcpu.
* vcpu_hva: The host va of the area being donated for the vcpu state.
* Must be page aligned. The size of the area must be equal to
* the page-aligned size of 'struct pkvm_hyp_vcpu'.
* Return 0 on success, negative error code on failure.
*/
+static int register_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm,
+ struct pkvm_hyp_vcpu *hyp_vcpu)
+{
+ unsigned int idx = hyp_vcpu->vcpu.vcpu_idx;
+
+ if (idx >= hyp_vm->kvm.created_vcpus)
+ return -EINVAL;
+
+ if (hyp_vm->vcpus[idx])
+ return -EINVAL;
+
+ /*
+ * Ensure the hyp_vcpu is initialised before publishing it to
+ * the vCPU-load path via 'hyp_vm->vcpus[]'.
+ */
+ smp_store_release(&hyp_vm->vcpus[idx], hyp_vcpu);
+ return 0;
+}
+
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
unsigned long vcpu_hva)
{
struct pkvm_hyp_vcpu *hyp_vcpu;
struct pkvm_hyp_vm *hyp_vm;
- unsigned int idx;
int ret;
hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu));
@@ -641,27 +913,21 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
goto unlock;
}
- idx = hyp_vm->nr_vcpus;
- if (idx >= hyp_vm->kvm.created_vcpus) {
- ret = -EINVAL;
- goto unlock;
- }
-
- ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx);
+ ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu);
if (ret)
goto unlock;
- hyp_vm->vcpus[idx] = hyp_vcpu;
- hyp_vm->nr_vcpus++;
+ ret = register_hyp_vcpu(hyp_vm, hyp_vcpu);
+ if (ret) {
+ unpin_host_vcpu(host_vcpu);
+ unpin_host_sve_state(hyp_vcpu);
+ }
unlock:
hyp_spin_unlock(&vm_table_lock);
- if (ret) {
+ if (ret)
unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu));
- return ret;
- }
-
- return 0;
+ return ret;
}
static void
@@ -676,9 +942,56 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
unmap_donated_memory_noclear(addr, size);
}
-int __pkvm_teardown_vm(pkvm_handle_t handle)
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn)
+{
+ struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle);
+ int ret = -EINVAL;
+
+ if (!hyp_vm)
+ return ret;
+
+ if (hyp_vm->kvm.arch.pkvm.is_dying)
+ ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm);
+
+ put_pkvm_hyp_vm(hyp_vm);
+ return ret;
+}
+
+static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle)
+{
+ struct pkvm_hyp_vm *hyp_vm;
+
+ hyp_assert_lock_held(&vm_table_lock);
+
+ hyp_vm = get_vm_by_handle(handle);
+ if (!hyp_vm || hyp_page_count(hyp_vm))
+ return NULL;
+
+ return hyp_vm;
+}
+
+int __pkvm_start_teardown_vm(pkvm_handle_t handle)
{
- struct kvm_hyp_memcache *mc;
+ struct pkvm_hyp_vm *hyp_vm;
+ int ret = 0;
+
+ hyp_spin_lock(&vm_table_lock);
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ hyp_vm->kvm.arch.pkvm.is_dying = true;
+unlock:
+ hyp_spin_unlock(&vm_table_lock);
+
+ return ret;
+}
+
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle)
+{
+ struct kvm_hyp_memcache *mc, *stage2_mc;
struct pkvm_hyp_vm *hyp_vm;
struct kvm *host_kvm;
unsigned int idx;
@@ -686,14 +999,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
int err;
hyp_spin_lock(&vm_table_lock);
- hyp_vm = get_vm_by_handle(handle);
- if (!hyp_vm) {
- err = -ENOENT;
- goto err_unlock;
- }
-
- if (WARN_ON(hyp_page_count(hyp_vm))) {
- err = -EBUSY;
+ hyp_vm = get_pkvm_unref_hyp_vm_locked(handle);
+ if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) {
+ err = -EINVAL;
goto err_unlock;
}
@@ -706,18 +1014,24 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
/* Reclaim guest pages (including page-table pages) */
mc = &host_kvm->arch.pkvm.teardown_mc;
- reclaim_guest_pages(hyp_vm, mc);
- unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
+ stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc;
+ reclaim_pgtable_pages(hyp_vm, stage2_mc);
+ unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus);
/* Push the metadata pages to the teardown memcache */
- for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) {
+ for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) {
struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx];
- struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
+ struct kvm_hyp_memcache *vcpu_mc;
+
+ if (!hyp_vcpu)
+ continue;
+
+ vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache;
while (vcpu_mc->nr_pages) {
void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
- push_hyp_memcache(mc, addr, hyp_virt_to_phys);
+ push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys);
unmap_donated_memory_noclear(addr, PAGE_SIZE);
}
@@ -733,3 +1047,121 @@ err_unlock:
hyp_spin_unlock(&vm_table_lock);
return err;
}
+
+static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
+{
+ u64 elr;
+
+ /* Fake up a data abort (level 3 translation fault on write) */
+ vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+ ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
+ FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);
+
+ /* Shuffle the IPA around into the HPFAR */
+ vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK;
+
+ /* This is a virtual address. 0's good. Let's go with 0. */
+ vcpu->arch.fault.far_el2 = 0;
+
+ /* Rewind the ELR so we return to the HVC once the IPA is mapped */
+ elr = read_sysreg(elr_el2);
+ elr -= 4;
+ write_sysreg(elr, elr_el2);
+
+ return ARM_EXCEPTION_TRAP;
+}
+
+static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ goto out_guest;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) {
+ case 0:
+ ret[0] = SMCCC_RET_SUCCESS;
+ goto out_guest;
+ case -ENOENT:
+ /*
+ * Convert the exception into a data abort so that the page
+ * being shared is mapped into the guest next time.
+ */
+ *exit_code = __pkvm_memshare_page_req(vcpu, ipa);
+ goto out_host;
+ }
+
+out_guest:
+ return true;
+out_host:
+ return false;
+}
+
+static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu)
+{
+ struct pkvm_hyp_vcpu *hyp_vcpu;
+ u64 ipa = smccc_get_arg1(vcpu);
+
+ if (!PAGE_ALIGNED(ipa))
+ return;
+
+ hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu);
+ if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa)))
+ ret[0] = SMCCC_RET_SUCCESS;
+}
+
+/*
+ * Handler for protected VM HVC calls.
+ *
+ * Returns true if the hypervisor has handled the exit (and control
+ * should return to the guest) or false if it hasn't (and the handling
+ * should be performed by the host).
+ */
+bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
+{
+ u64 val[4] = { SMCCC_RET_INVALID_PARAMETER };
+ bool handled = true;
+
+ switch (smccc_get_function(vcpu)) {
+ case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+ val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
+ if (smccc_get_arg1(vcpu) ||
+ smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ val[0] = PAGE_SIZE;
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ handled = pkvm_memshare_call(val, vcpu, exit_code);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
+ if (smccc_get_arg2(vcpu) ||
+ smccc_get_arg3(vcpu)) {
+ break;
+ }
+
+ pkvm_memunshare_call(val, vcpu);
+ break;
+ default:
+ /* Punt everything else back to the host, for now. */
+ handled = false;
+ }
+
+ if (handled)
+ smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
+ return handled;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index c3e196fb8b18..e20db999e328 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -6,11 +6,12 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
-#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>
+#include <nvhe/arm-smccc.h>
#include <nvhe/memory.h>
#include <nvhe/trap_handler.h>
@@ -65,7 +66,7 @@ static unsigned long psci_call(unsigned long fn, unsigned long arg0,
{
struct arm_smccc_res res;
- arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
+ hyp_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
return res.a0;
}
@@ -200,30 +201,42 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt)
__hyp_pa(init_params), 0);
}
-asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
+static void __noreturn __kvm_host_psci_cpu_entry(unsigned long pc, unsigned long r0)
{
- struct psci_boot_args *boot_args;
- struct kvm_cpu_context *host_ctxt;
-
- host_ctxt = host_data_ptr(host_ctxt);
+ struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt);
- if (is_cpu_on)
- boot_args = this_cpu_ptr(&cpu_on_args);
- else
- boot_args = this_cpu_ptr(&suspend_args);
+ trace_hyp_enter(host_ctxt, HYP_REASON_PSCI);
- cpu_reg(host_ctxt, 0) = boot_args->r0;
- write_sysreg_el2(boot_args->pc, SYS_ELR);
-
- if (is_cpu_on)
- release_boot_args(boot_args);
+ cpu_reg(host_ctxt, 0) = r0;
+ write_sysreg_el2(pc, SYS_ELR);
write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+ trace_hyp_exit(host_ctxt, HYP_REASON_PSCI);
__host_enter(host_ctxt);
}
+asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void)
+{
+ struct psci_boot_args *boot_args = this_cpu_ptr(&cpu_on_args);
+ unsigned long pc, r0;
+
+ pc = READ_ONCE(boot_args->pc);
+ r0 = READ_ONCE(boot_args->r0);
+
+ release_boot_args(boot_args);
+
+ __kvm_host_psci_cpu_entry(pc, r0);
+}
+
+asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void)
+{
+ struct psci_boot_args *boot_args = this_cpu_ptr(&suspend_args);
+
+ __kvm_host_psci_cpu_entry(boot_args->pc, boot_args->r0);
+}
+
static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt)
{
if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id))
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index d62bcb5634a2..d461981616d9 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -28,6 +28,7 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
+static void *selftest_base;
static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
@@ -38,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
hyp_early_alloc_init(virt, size);
+ nr_pages = pkvm_selftest_pages();
+ selftest_base = hyp_early_alloc_contig(nr_pages);
+ if (nr_pages && !selftest_base)
+ return -ENOMEM;
+
nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
vmemmap_base = hyp_early_alloc_contig(nr_pages);
if (!vmemmap_base)
@@ -119,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
+ ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
if (ret)
return ret;
@@ -180,7 +190,9 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
enum pkvm_page_state state;
+ struct hyp_page *page;
phys_addr_t phys;
+ enum kvm_pgtable_prot prot;
if (!kvm_pte_valid(ctx->old))
return 0;
@@ -192,19 +204,32 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
if (!addr_is_memory(phys))
return -EINVAL;
+ page = hyp_phys_to_page(phys);
+
/*
* Adjust the host stage-2 mappings to match the ownership attributes
- * configured in the hypervisor stage-1.
+ * configured in the hypervisor stage-1, and make sure to propagate them
+ * to the hyp_vmemmap state.
*/
- state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
+ prot = kvm_pgtable_hyp_pte_prot(ctx->old);
+ state = pkvm_getstate(prot);
switch (state) {
case PKVM_PAGE_OWNED:
- return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ set_hyp_state(page, PKVM_PAGE_OWNED);
+ /* hyp text is RO in the host stage-2 to be inspected on panic. */
+ if (prot == PAGE_HYP_EXEC) {
+ set_host_state(page, PKVM_NOPAGE);
+ return host_stage2_idmap_locked(phys, PAGE_SIZE, KVM_PGTABLE_PROT_R);
+ } else {
+ return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
+ }
case PKVM_PAGE_SHARED_OWNED:
- hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED;
+ set_hyp_state(page, PKVM_PAGE_SHARED_OWNED);
+ set_host_state(page, PKVM_PAGE_SHARED_BORROWED);
break;
case PKVM_PAGE_SHARED_BORROWED:
- hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED;
+ set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED);
+ set_host_state(page, PKVM_PAGE_SHARED_OWNED);
break;
default:
return -EINVAL;
@@ -287,15 +312,15 @@ void __noreturn __pkvm_init_finalise(void)
};
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
- ret = fix_host_ownership();
+ ret = fix_hyp_pgtable_refcnt();
if (ret)
goto out;
- ret = fix_hyp_pgtable_refcnt();
+ ret = hyp_create_fixmap();
if (ret)
goto out;
- ret = hyp_create_pcpu_fixmap();
+ ret = fix_host_ownership();
if (ret)
goto out;
@@ -304,6 +329,8 @@ void __noreturn __pkvm_init_finalise(void)
goto out;
pkvm_hyp_vm_table_init(vm_table_base);
+
+ pkvm_ownership_selftest(selftest_base);
out:
/*
* We tail-called to here from handle___pkvm_init() and will not return,
@@ -314,8 +341,7 @@ out:
__host_enter(host_ctxt);
}
-int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
- unsigned long *per_cpu_base, u32 hyp_va_bits)
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params;
void *virt = hyp_phys_to_virt(phys);
@@ -328,7 +354,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
return -EINVAL;
hyp_spin_lock_init(&pkvm_pgd_lock);
- hyp_nr_cpus = nr_cpus;
ret = divide_memory_pool(virt, size);
if (ret)
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
index 5b6eeab1a774..7c832d60d22b 100644
--- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -34,7 +34,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
stacktrace_info->pc = pc;
}
-#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+#ifdef CONFIG_PKVM_STACKTRACE
#include <asm/stacktrace/nvhe.h>
DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace);
@@ -134,11 +134,11 @@ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
unwind(&state, pkvm_save_backtrace_entry, &idx);
}
-#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+#else /* !CONFIG_PKVM_STACKTRACE */
static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
{
}
-#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+#endif /* CONFIG_PKVM_STACKTRACE */
/*
* kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 7d2ba6ef0261..8d1df3d33595 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -7,7 +7,6 @@
#include <hyp/switch.h>
#include <hyp/sysreg-sr.h>
-#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -21,6 +20,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
#include <asm/kvm_mmu.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
@@ -33,70 +33,30 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
-extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
-
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
-
- if (!guest_owns_fp_regs())
- __activate_traps_fpsimd32(vcpu);
-
- if (has_hvhe()) {
- val |= CPACR_EL1_TTA;
-
- if (guest_owns_fp_regs()) {
- val |= CPACR_EL1_FPEN;
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN;
- }
-
- write_sysreg(val, cpacr_el1);
- } else {
- val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
-
- /*
- * Always trap SME since it's not supported in KVM.
- * TSM is RES1 if SME isn't implemented.
- */
- val |= CPTR_EL2_TSM;
-
- if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
- val |= CPTR_EL2_TZ;
-
- if (!guest_owns_fp_regs())
- val |= CPTR_EL2_TFP;
+struct fgt_masks hfgrtr_masks;
+struct fgt_masks hfgwtr_masks;
+struct fgt_masks hfgitr_masks;
+struct fgt_masks hdfgrtr_masks;
+struct fgt_masks hdfgwtr_masks;
+struct fgt_masks hafgrtr_masks;
+struct fgt_masks hfgrtr2_masks;
+struct fgt_masks hfgwtr2_masks;
+struct fgt_masks hfgitr2_masks;
+struct fgt_masks hdfgrtr2_masks;
+struct fgt_masks hdfgwtr2_masks;
+struct fgt_masks ich_hfgrtr_masks;
+struct fgt_masks ich_hfgwtr_masks;
+struct fgt_masks ich_hfgitr_masks;
- write_sysreg(val, cptr_el2);
- }
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- if (has_hvhe()) {
- u64 val = CPACR_EL1_FPEN;
-
- if (cpus_have_final_cap(ARM64_SVE))
- val |= CPACR_EL1_ZEN;
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPACR_EL1_SMEN;
-
- write_sysreg(val, cpacr_el1);
- } else {
- u64 val = CPTR_NVHE_EL2_RES1;
-
- if (!cpus_have_final_cap(ARM64_SVE))
- val |= CPTR_EL2_TZ;
- if (!cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
-
- write_sysreg(val, cptr_el2);
- }
-}
+extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);
+
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+
__activate_traps_common(vcpu);
__activate_cptr_traps(vcpu);
@@ -140,9 +100,11 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
__deactivate_traps_common(vcpu);
- write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
+ write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);
__deactivate_cptr_traps(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
@@ -151,6 +113,12 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
/* Save VGICv3 state on non-VHE systems */
static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
{
+ if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) {
+ __vgic_v5_save_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ __vgic_v5_save_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ return;
+ }
+
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
__vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
@@ -160,6 +128,12 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
/* Restore VGICv3 state on non-VHE systems */
static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
{
+ if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) {
+ __vgic_v5_restore_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ __vgic_v5_restore_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5);
+ return;
+ }
+
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
__vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
@@ -231,6 +205,7 @@ static const exit_handler_fn hyp_exit_handlers[] = {
static const exit_handler_fn pvm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = NULL,
+ [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64,
[ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64,
[ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted,
[ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd,
@@ -252,7 +227,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu);
- synchronize_vcpu_pstate(vcpu, exit_code);
+ synchronize_vcpu_pstate(vcpu);
/*
* Some guests (e.g., protected VMs) are not be allowed to run in
@@ -319,7 +294,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
* We're about to restore some new MMU state. Make sure
* ongoing page-table walks that have started before we
* trapped to EL2 have completed. This also synchronises the
- * above disabling of SPE and TRBE.
+ * above disabling of BRBE.
*
* See DDI0487I.a D8.1.5 "Out-of-context translation regimes",
* rule R_LFHQG and subsequent information statements.
@@ -349,10 +324,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__debug_switch_to_guest(vcpu);
do {
+ trace_hyp_exit(host_ctxt, HYP_REASON_ERET_GUEST);
+
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
/* And we're baaack! */
+ trace_hyp_enter(host_ctxt, HYP_REASON_GUEST_EXIT);
} while (fixup_guest_exit(vcpu, &exit_code));
__sysreg_save_state_nvhe(guest_ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 1ddd9ed3cbb3..8c3fbb413a06 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -20,6 +20,7 @@
*/
u64 id_aa64pfr0_el1_sys_val;
u64 id_aa64pfr1_el1_sys_val;
+u64 id_aa64pfr2_el1_sys_val;
u64 id_aa64isar0_el1_sys_val;
u64 id_aa64isar1_el1_sys_val;
u64 id_aa64isar2_el1_sys_val;
@@ -108,6 +109,11 @@ static const struct pvm_ftr_bits pvmid_aa64pfr1[] = {
FEAT_END
};
+static const struct pvm_ftr_bits pvmid_aa64pfr2[] = {
+ MAX_FEAT(ID_AA64PFR2_EL1, GCIE, NI),
+ FEAT_END
+};
+
static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = {
MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40),
MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16),
@@ -134,7 +140,7 @@ static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = {
MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP),
MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP),
MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP),
- MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18),
+ MAX_FEAT(ID_AA64MMFR2_EL1, IDS, IMP),
MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP),
MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2),
MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP),
@@ -221,6 +227,8 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id)
return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0);
case SYS_ID_AA64PFR1_EL1:
return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1);
+ case SYS_ID_AA64PFR2_EL1:
+ return get_restricted_features(vcpu, id_aa64pfr2_el1_sys_val, pvmid_aa64pfr2);
case SYS_ID_AA64ISAR0_EL1:
return id_aa64isar0_el1_sys_val;
case SYS_ID_AA64ISAR1_EL1:
@@ -243,17 +251,17 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id)
}
}
-/*
- * Inject an unknown/undefined exception to an AArch64 guest while most of its
- * sysregs are live.
- */
-static void inject_undef64(struct kvm_vcpu *vcpu)
+static void inject_sync64(struct kvm_vcpu *vcpu, u64 esr)
{
- u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
-
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
*vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+ /*
+ * Make sure we have the latest update to VBAR_EL1, as pKVM
+ * handles traps very early, before sysregs are resync'ed
+ */
+ __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));
+
kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
__kvm_adjust_pc(vcpu);
@@ -264,6 +272,15 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
}
+/*
+ * Inject an unknown/undefined exception to an AArch64 guest while most of its
+ * sysregs are live.
+ */
+static void inject_undef64(struct kvm_vcpu *vcpu)
+{
+ inject_sync64(vcpu, (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT));
+}
+
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r)
{
@@ -338,6 +355,18 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu,
return true;
}
+static bool pvm_idst_access(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, IDS, IMP))
+ inject_sync64(vcpu, kvm_vcpu_get_esr(vcpu));
+ else
+ inject_undef64(vcpu);
+
+ return false;
+}
+
/* Mark the specified system register as an AArch32 feature id register. */
#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 }
@@ -371,6 +400,17 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Cache maintenance by set/way operations are restricted. */
/* Debug and Trace Registers are restricted. */
+ RAZ_WI(SYS_DBGBVRn_EL1(0)),
+ RAZ_WI(SYS_DBGBCRn_EL1(0)),
+ RAZ_WI(SYS_DBGWVRn_EL1(0)),
+ RAZ_WI(SYS_DBGWCRn_EL1(0)),
+ RAZ_WI(SYS_MDSCR_EL1),
+ RAZ_WI(SYS_OSLAR_EL1),
+ RAZ_WI(SYS_OSLSR_EL1),
+ RAZ_WI(SYS_OSDLR_EL1),
+
+ /* Group 1 ID registers */
+ HOST_HANDLED(SYS_REVIDR_EL1),
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
@@ -407,7 +447,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* CRm=4 */
AARCH64(SYS_ID_AA64PFR0_EL1),
AARCH64(SYS_ID_AA64PFR1_EL1),
- ID_UNALLOCATED(4,2),
+ AARCH64(SYS_ID_AA64PFR2_EL1),
ID_UNALLOCATED(4,3),
AARCH64(SYS_ID_AA64ZFR0_EL1),
ID_UNALLOCATED(4,5),
@@ -440,6 +480,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Scalable Vector Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_PMR_EL1),
+
RAZ_WI(SYS_ERRIDR_EL1),
RAZ_WI(SYS_ERRSELR_EL1),
RAZ_WI(SYS_ERXFR_EL1),
@@ -453,13 +495,20 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Limited Ordering Regions Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_DIR_EL1),
+ HOST_HANDLED(SYS_ICC_RPR_EL1),
HOST_HANDLED(SYS_ICC_SGI1R_EL1),
HOST_HANDLED(SYS_ICC_ASGI1R_EL1),
HOST_HANDLED(SYS_ICC_SGI0R_EL1),
+ HOST_HANDLED(SYS_ICC_CTLR_EL1),
{ SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, },
HOST_HANDLED(SYS_CCSIDR_EL1),
HOST_HANDLED(SYS_CLIDR_EL1),
+ { SYS_DESC(SYS_CCSIDR2_EL1), .access = pvm_idst_access },
+ { SYS_DESC(SYS_GMID_EL1), .access = pvm_idst_access },
+ { SYS_DESC(SYS_SMIDR_EL1), .access = pvm_idst_access },
+ HOST_HANDLED(SYS_AIDR_EL1),
HOST_HANDLED(SYS_CSSELR_EL1),
HOST_HANDLED(SYS_CTR_EL0),
diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
index dba101565de3..3cc613cce5f5 100644
--- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c
@@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
{
- __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1));
+ u64 midr = ctxt_midr_el1(ctxt);
+
+ __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1));
__sysreg_restore_common_state(ctxt);
__sysreg_restore_user_state(ctxt);
__sysreg_restore_el2_return_state(ctxt);
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 48da9ca9763f..b29140995d48 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1is, ipa, level);
/*
@@ -169,7 +168,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
*/
dsb(ish);
__tlbi(vmalle1is);
- dsb(ish);
+ __tlbi_sync_s1ish_hyp();
isb();
exit_vmid_context(&cxt);
@@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*
@@ -226,7 +224,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
dsb(ish);
__tlbi(vmalle1is);
- dsb(ish);
+ __tlbi_sync_s1ish_hyp();
isb();
exit_vmid_context(&cxt);
@@ -240,7 +238,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
enter_vmid_context(mmu, &cxt, false);
__tlbi(vmalls12e1is);
- dsb(ish);
+ __tlbi_sync_s1ish_hyp();
isb();
exit_vmid_context(&cxt);
@@ -266,5 +264,5 @@ void __kvm_flush_vm_context(void)
/* Same remark as in enter_vmid_context() */
dsb(ish);
__tlbi(alle1is);
- dsb(ish);
+ __tlbi_sync_s1ish_hyp();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
new file mode 100644
index 000000000000..e7e150ab265f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/trace.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/clock.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <asm/percpu.h>
+#include <asm/kvm_mmu.h>
+#include <asm/local.h>
+
+#include "simple_ring_buffer.c"
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs);
+
+static struct hyp_trace_buffer {
+ struct simple_rb_per_cpu __percpu *simple_rbs;
+ void *bpages_backing_start;
+ size_t bpages_backing_size;
+ hyp_spinlock_t lock;
+} trace_buffer = {
+ .simple_rbs = &__simple_rbs,
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+};
+
+static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer)
+{
+ return trace_buffer->bpages_backing_size > 0;
+}
+
+void *tracing_reserve_entry(unsigned long length)
+{
+ return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length,
+ trace_clock());
+}
+
+void tracing_commit_entry(void)
+{
+ simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs));
+}
+
+static int __admit_host_mem(void *start, u64 size)
+{
+ if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size) || !size)
+ return -EINVAL;
+
+ if (!is_protected_kvm_enabled())
+ return 0;
+
+ return __pkvm_host_donate_hyp(hyp_virt_to_pfn(start), size >> PAGE_SHIFT);
+}
+
+static void __release_host_mem(void *start, u64 size)
+{
+ if (!is_protected_kvm_enabled())
+ return;
+
+ WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT));
+}
+
+static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer,
+ struct hyp_trace_desc *desc)
+{
+ void *start = (void *)kern_hyp_va(desc->bpages_backing_start);
+ size_t size = desc->bpages_backing_size;
+ int ret;
+
+ ret = __admit_host_mem(start, size);
+ if (ret)
+ return ret;
+
+ memset(start, 0, size);
+
+ trace_buffer->bpages_backing_start = start;
+ trace_buffer->bpages_backing_size = size;
+
+ return 0;
+}
+
+static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer)
+{
+ void *start = trace_buffer->bpages_backing_start;
+ size_t size = trace_buffer->bpages_backing_size;
+
+ if (!size)
+ return;
+
+ memset(start, 0, size);
+
+ __release_host_mem(start, size);
+
+ trace_buffer->bpages_backing_start = 0;
+ trace_buffer->bpages_backing_size = 0;
+}
+
+static void *__pin_shared_page(unsigned long kern_va)
+{
+ void *va = kern_hyp_va((void *)kern_va);
+
+ if (!is_protected_kvm_enabled())
+ return va;
+
+ return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va;
+}
+
+static void __unpin_shared_page(void *va)
+{
+ if (!is_protected_kvm_enabled())
+ return;
+
+ hyp_unpin_shared_mem(va, va + PAGE_SIZE);
+}
+
+static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer)
+{
+ int cpu;
+
+ hyp_assert_lock_held(&trace_buffer->lock);
+
+ if (!hyp_trace_buffer_loaded(trace_buffer))
+ return;
+
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+ simple_ring_buffer_unload_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+ __unpin_shared_page);
+
+ hyp_trace_buffer_unload_bpage_backing(trace_buffer);
+}
+
+static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
+ struct hyp_trace_desc *desc)
+{
+ struct simple_buffer_page *bpages;
+ struct ring_buffer_desc *rb_desc;
+ int ret, cpu;
+
+ hyp_assert_lock_held(&trace_buffer->lock);
+
+ if (hyp_trace_buffer_loaded(trace_buffer))
+ return -EINVAL;
+
+ ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc);
+ if (ret)
+ return ret;
+
+ bpages = trace_buffer->bpages_backing_start;
+ for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+ ret = simple_ring_buffer_init_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+ bpages, rb_desc, __pin_shared_page,
+ __unpin_shared_page);
+ if (ret)
+ break;
+
+ bpages += rb_desc->nr_page_va;
+ }
+
+ if (ret)
+ hyp_trace_buffer_unload(trace_buffer);
+
+ return ret;
+}
+
+static bool hyp_trace_desc_is_valid(struct hyp_trace_desc *desc, size_t desc_size)
+{
+ struct ring_buffer_desc *rb_desc;
+ unsigned int cpu;
+ size_t nr_bpages;
+ void *desc_end;
+
+ if (!is_protected_kvm_enabled())
+ return true;
+
+ /*
+ * Both desc_size and bpages_backing_size are untrusted host-provided
+ * values. We rely on __pkvm_host_donate_hyp() to enforce their validity.
+ */
+ desc_end = (void *)desc + desc_size;
+ nr_bpages = desc->bpages_backing_size / sizeof(struct simple_buffer_page);
+
+ for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+ /* Can we read nr_page_va? */
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end)
+ return false;
+
+ /* Overflow desc? */
+ if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end)
+ return false;
+
+ /* Overflow bpages backing memory? */
+ if (nr_bpages < rb_desc->nr_page_va)
+ return false;
+
+ if (cpu >= hyp_nr_cpus)
+ return false;
+
+ if (cpu != rb_desc->cpu)
+ return false;
+
+ nr_bpages -= rb_desc->nr_page_va;
+ }
+
+ return true;
+}
+
+int __tracing_load(unsigned long desc_hva, size_t desc_size)
+{
+ struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva);
+ int ret;
+
+ ret = __admit_host_mem(desc, desc_size);
+ if (ret)
+ return ret;
+
+ if (!hyp_trace_desc_is_valid(desc, desc_size)) {
+ ret = -EINVAL;
+ goto err_release_desc;
+ }
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ ret = hyp_trace_buffer_load(&trace_buffer, desc);
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+err_release_desc:
+ __release_host_mem(desc, desc_size);
+ return ret;
+}
+
+void __tracing_unload(void)
+{
+ hyp_spin_lock(&trace_buffer.lock);
+ hyp_trace_buffer_unload(&trace_buffer);
+ hyp_spin_unlock(&trace_buffer.lock);
+}
+
+int __tracing_enable(bool enable)
+{
+ int cpu, ret = enable ? -EINVAL : 0;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (!hyp_trace_buffer_loaded(&trace_buffer))
+ goto unlock;
+
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+ simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu),
+ enable);
+
+ ret = 0;
+
+unlock:
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}
+
+int __tracing_swap_reader(unsigned int cpu)
+{
+ int ret = -ENODEV;
+
+ if (cpu >= hyp_nr_cpus)
+ return -EINVAL;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (hyp_trace_buffer_loaded(&trace_buffer))
+ ret = simple_ring_buffer_swap_reader_page(
+ per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}
+
+void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
+{
+ int cpu;
+
+ /* After this loop, all CPUs are observing the new bank... */
+ for (cpu = 0; cpu < hyp_nr_cpus; cpu++) {
+ struct simple_rb_per_cpu *simple_rb = per_cpu_ptr(trace_buffer.simple_rbs, cpu);
+
+ while (READ_ONCE(simple_rb->status) == SIMPLE_RB_WRITING)
+ ;
+ }
+
+ /* ...we can now override the old one and swap. */
+ trace_clock_update(mult, shift, epoch_ns, epoch_cyc);
+}
+
+int __tracing_reset(unsigned int cpu)
+{
+ int ret = -ENODEV;
+
+ if (cpu >= hyp_nr_cpus)
+ return -EINVAL;
+
+ hyp_spin_lock(&trace_buffer.lock);
+
+ if (hyp_trace_buffer_loaded(&trace_buffer))
+ ret = simple_ring_buffer_reset(per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+ hyp_spin_unlock(&trace_buffer.lock);
+
+ return ret;
+}