Diffstat (limited to 'arch/s390/kvm')
-rw-r--r--  arch/s390/kvm/Kconfig          4
-rw-r--r--  arch/s390/kvm/Makefile         3
-rw-r--r--  arch/s390/kvm/dat.c         1321
-rw-r--r--  arch/s390/kvm/dat.h          976
-rw-r--r--  arch/s390/kvm/diag.c          32
-rw-r--r--  arch/s390/kvm/faultin.c      148
-rw-r--r--  arch/s390/kvm/faultin.h       92
-rw-r--r--  arch/s390/kvm/gaccess.c     1031
-rw-r--r--  arch/s390/kvm/gaccess.h       20
-rw-r--r--  arch/s390/kvm/gmap-vsie.c    142
-rw-r--r--  arch/s390/kvm/gmap.c        1342
-rw-r--r--  arch/s390/kvm/gmap.h         252
-rw-r--r--  arch/s390/kvm/guestdbg.c       8
-rw-r--r--  arch/s390/kvm/intercept.c     29
-rw-r--r--  arch/s390/kvm/interrupt.c    168
-rw-r--r--  arch/s390/kvm/kvm-s390.c    1329
-rw-r--r--  arch/s390/kvm/kvm-s390.h      75
-rw-r--r--  arch/s390/kvm/pci.c            9
-rw-r--r--  arch/s390/kvm/priv.c         237
-rw-r--r--  arch/s390/kvm/pv.c           246
-rw-r--r--  arch/s390/kvm/trace-s390.h     4
-rw-r--r--  arch/s390/kvm/vsie.c         255
22 files changed, 5792 insertions, 1931 deletions
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index cae908d64550..5b835bc6a194 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
select HAVE_KVM_CPU_RELAX_INTERCEPT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
select KVM_COMMON
@@ -29,7 +28,8 @@ config KVM
select HAVE_KVM_INVALID_WAKEUPS
select HAVE_KVM_NO_POLL
select KVM_VFIO
- select MMU_NOTIFIER
+ select VIRT_XFER_TO_GUEST_WORK
+ select KVM_MMU_LOCKLESS_AGING
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index f0ffe874adc2..dac9d53b23d8 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-y += dat.o gmap.o faultin.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
new file mode 100644
index 000000000000..7b8d70fe406d
--- /dev/null
+++ b/arch/s390/kvm/dat.c
@@ -0,0 +1,1321 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2007, 2020, 2024
+ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * David Hildenbrand <david@redhat.com>
+ * Janosch Frank <frankja@linux.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pagewalk.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/pgtable.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/pgalloc.h>
+
+#include <asm/page-states.h>
+#include <asm/tlb.h>
+#include "dat.h"
+
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc)
+{
+ void *o;
+
+ for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) {
+ o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+ if (!o)
+ return -ENOMEM;
+ mc->crsts[mc->n_crsts] = o;
+ }
+ for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) {
+ o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!o)
+ return -ENOMEM;
+ mc->pts[mc->n_pts] = o;
+ }
+ for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) {
+ o = kzalloc_obj(*mc->rmaps[0], GFP_KERNEL_ACCOUNT);
+ if (!o)
+ return -ENOMEM;
+ mc->rmaps[mc->n_rmaps] = o;
+ }
+ return 0;
+}
+
+static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc)
+{
+ struct page_table *res;
+
+ res = kvm_s390_mmu_cache_alloc_pt(mc);
+ if (res)
+ __arch_set_page_dat(res, 1);
+ return res;
+}
+
+static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc)
+{
+ struct crst_table *res;
+
+ res = kvm_s390_mmu_cache_alloc_crst(mc);
+ if (res)
+ __arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER);
+ return res;
+}
+
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init)
+{
+ struct page *page;
+ void *virt;
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+ if (!page)
+ return NULL;
+ virt = page_to_virt(page);
+ __arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER);
+ crst_table_init(virt, init);
+ return virt;
+}
+
+void dat_free_level(struct crst_table *table, bool owns_ptes)
+{
+ unsigned int i;
+
+ for (i = 0; i < _CRST_ENTRIES; i++) {
+ if (table->crstes[i].h.fc || table->crstes[i].h.i)
+ continue;
+ if (!is_pmd(table->crstes[i]))
+ dat_free_level(dereference_crste(table->crstes[i]), owns_ptes);
+ else if (owns_ptes)
+ dat_free_pt(dereference_pmd(table->crstes[i].pmd));
+ }
+ dat_free_crst(table);
+}
+
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype)
+{
+ struct crst_table *table;
+ union crste crste;
+
+ while (asce->dt > newtype) {
+ table = dereference_asce(*asce);
+ crste = table->crstes[0];
+ if (crste.h.fc)
+ return 0;
+ if (!crste.h.i) {
+ asce->rsto = crste.h.fc0.to;
+ dat_free_crst(table);
+ } else {
+ crste.h.tt--;
+ crst_table_init((void *)table, crste.val);
+ }
+ asce->dt--;
+ }
+ while (asce->dt < newtype) {
+ crste = _crste_fc0(asce->rsto, asce->dt + 1);
+ table = dat_alloc_crst_noinit(mc);
+ if (!table)
+ return -ENOMEM;
+ crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val);
+ table->crstes[0] = crste;
+ asce->rsto = __pa(table) >> PAGE_SHIFT;
+ asce->dt++;
+ }
+ return 0;
+}
+
+/**
+ * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
+ * @crstep: Pointer to the CRST entry.
+ * @old: Expected old value.
+ * @new: Replacement entry.
+ * @gfn: The affected guest address.
+ * @asce: The asce of the address space.
+ *
+ * This function is needed to atomically exchange a CRSTE that potentially
+ * maps a prefix area, without having to invalidate it in between.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %true if the exchange was successful.
+ */
+bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+ gfn_t gfn, union asce asce)
+{
+ if (old.h.i)
+ return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
+ if (cpu_has_edat2())
+ return crdte_crste(crstep, old, new, gfn, asce);
+ return cspg_crste(crstep, old, new);
+}
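
For illustration, a minimal sketch of the usual caller idiom (not part of this patch): re-read the entry and retry whenever the compare-and-swap loses a race, as the split and notification helpers further down do.

/* Hypothetical helper: set the DAT-protection bit on a leaf crste under mmu_lock. */
static void example_protect_leaf_crste(union crste *crstep, gfn_t gfn, union asce asce)
{
	union crste old, new;

	do {
		old = READ_ONCE(*crstep);
		if (!old.h.fc || old.h.p)
			return;
		new = old;
		new.h.p = 1;
	} while (!dat_crstep_xchg_atomic(crstep, old, new, gfn, asce));
}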
+
+static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste)
+{
+ union skey nkey = { .acc = pgste.acc, .fp = pgste.fp };
+
+ page_set_storage_key(pte_origin(pte), nkey.skey, 0);
+}
+
+static void dat_move_storage_key(union pte old, union pte new)
+{
+ page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1);
+}
+
+static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste)
+{
+ union skey skey;
+
+ skey.skey = page_get_storage_key(pte_origin(pte));
+
+ pgste.acc = skey.acc;
+ pgste.fp = skey.fp;
+ pgste.gr |= skey.r;
+ pgste.gc |= skey.c;
+
+ return pgste;
+}
+
+union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn,
+ union asce asce, bool uses_skeys)
+{
+ union pte old = READ_ONCE(*ptep);
+
+ /* Updating only the software bits while holding the pgste lock. */
+ if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) {
+ WRITE_ONCE(ptep->swbyte, new.swbyte);
+ return pgste;
+ }
+
+ if (!old.h.i) {
+ unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0);
+
+ if (machine_has_tlb_guest())
+ __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL);
+ else
+ __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL);
+ }
+
+ if (uses_skeys) {
+ if (old.h.i && !new.h.i)
+ /* Invalid to valid: restore storage keys from PGSTE. */
+ dat_set_storage_key_from_pgste(new, pgste);
+ else if (!old.h.i && new.h.i)
+ /* Valid to invalid: save storage keys to PGSTE. */
+ pgste = dat_save_storage_key_into_pgste(old, pgste);
+ else if (!old.h.i && !new.h.i)
+ /* Valid to valid: move storage keys. */
+ if (old.h.pfra != new.h.pfra)
+ dat_move_storage_key(old, new);
+ /* Invalid to invalid: nothing to do. */
+ }
+
+ WRITE_ONCE(*ptep, new);
+ return pgste;
+}
+
+/*
+ * dat_split_ste() - Split a segment table entry into page table entries.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: 0 in case of success, -ENOMEM if running out of memory.
+ */
+static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn,
+ union asce asce, bool uses_skeys)
+{
+ union pgste pgste_init;
+ struct page_table *pt;
+ union pmd new, old;
+ union pte init;
+ int i;
+
+ BUG_ON(!mc);
+ old = READ_ONCE(*pmdp);
+
+ /* Already split, nothing to do. */
+ if (!old.h.i && !old.h.fc)
+ return 0;
+
+ pt = dat_alloc_pt_noinit(mc);
+ if (!pt)
+ return -ENOMEM;
+ new.val = virt_to_phys(pt);
+
+ while (old.h.i || old.h.fc) {
+ init.val = pmd_origin_large(old);
+ init.h.p = old.h.p;
+ init.h.i = old.h.i;
+ init.s.d = old.s.fc1.d;
+ init.s.w = old.s.fc1.w;
+ init.s.y = old.s.fc1.y;
+ init.s.sd = old.s.fc1.sd;
+ init.s.pr = old.s.fc1.pr;
+ pgste_init.val = 0;
+ if (old.h.fc) {
+ for (i = 0; i < _PAGE_ENTRIES; i++)
+ pt->ptes[i].val = init.val | i * PAGE_SIZE;
+ /* No need to take locks as the page table is not installed yet. */
+ pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+ pgste_init.vsie_notif = old.s.fc1.vsie_notif;
+ pgste_init.pcl = uses_skeys && init.h.i;
+ dat_init_pgstes(pt, pgste_init.val);
+ } else {
+ dat_init_page_table(pt, init.val, 0);
+ }
+
+ if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) {
+ if (!pgste_init.pcl)
+ return 0;
+ for (i = 0; i < _PAGE_ENTRIES; i++) {
+ union pgste pgste = pt->pgstes[i];
+
+ pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste);
+ pgste_set_unlock(pt->ptes + i, pgste);
+ }
+ return 0;
+ }
+ old = READ_ONCE(*pmdp);
+ }
+
+ dat_free_pt(pt);
+ return 0;
+}
+
+/*
+ * dat_split_crste() - Split a crste into smaller crstes.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %0 in case of success, %-ENOMEM if running out of memory.
+ */
+static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep,
+ gfn_t gfn, union asce asce, bool uses_skeys)
+{
+ struct crst_table *table;
+ union crste old, new, init;
+ int i;
+
+ old = READ_ONCE(*crstep);
+ if (is_pmd(old))
+ return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys);
+
+ BUG_ON(!mc);
+
+ /* Already split, nothing to do. */
+ if (!old.h.i && !old.h.fc)
+ return 0;
+
+ table = dat_alloc_crst_noinit(mc);
+ if (!table)
+ return -ENOMEM;
+
+ new.val = virt_to_phys(table);
+ new.h.tt = old.h.tt;
+ new.h.fc0.tl = _REGION_ENTRY_LENGTH;
+
+ while (old.h.i || old.h.fc) {
+ init = old;
+ init.h.tt--;
+ if (old.h.fc) {
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ table->crstes[i].val = init.val | i * HPAGE_SIZE;
+ } else {
+ crst_table_init((void *)table, init.val);
+ }
+ if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
+ return 0;
+ old = READ_ONCE(*crstep);
+ }
+
+ dat_free_crst(table);
+ return 0;
+}
+
+/**
+ * dat_entry_walk() - Walk the gmap page tables.
+ * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither
+ * %DAT_WALK_SPLIT nor %DAT_WALK_ALLOC is specified in @flags.
+ * @gfn: Guest frame.
+ * @asce: The ASCE of the address space.
+ * @flags: Flags from the DAT_WALK_* macros.
+ * @walk_level: Level to walk to, from the TABLE_TYPE_* macros.
+ * @last: Will be filled with the last visited non-pte DAT entry.
+ * @ptepp: Will be filled with the last visited pte entry, if any, otherwise NULL.
+ *
+ * Walks the DAT tables down to @walk_level, returning the visited entries via @last and @ptepp.
+ *
+ * The @flags have the following meanings:
+ * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries
+ * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
+ * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
+ * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered
+ * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached
+ * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to
+ * continue walking to ptes with only DAT_WALK_ANY
+ * * %DAT_WALK_USES_SKEYS: storage keys are in use
+ *
+ * Context: called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * %PGM_ADDRESSING if the requested address lies outside memory
+ * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
+ * * %-EFAULT if the requested address lies inside a memory hole of a different type
+ * * %-EINVAL if the given ASCE is not compatible with the requested level
+ * * %-EFBIG if the requested level could not be reached because a larger frame was found
+ * * %-ENOENT if the requested level could not be reached for other reasons
+ * * %-ENOMEM if running out of memory while allocating or splitting a table
+ */
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+ int walk_level, union crste **last, union pte **ptepp)
+{
+ union vaddress vaddr = { .addr = gfn_to_gpa(gfn) };
+ bool continue_anyway = flags & DAT_WALK_CONTINUE;
+ bool uses_skeys = flags & DAT_WALK_USES_SKEYS;
+ bool ign_holes = flags & DAT_WALK_IGN_HOLES;
+ bool allocate = flags & DAT_WALK_ALLOC;
+ bool split = flags & DAT_WALK_SPLIT;
+ bool leaf = flags & DAT_WALK_LEAF;
+ bool any = flags & DAT_WALK_ANY;
+ struct page_table *pgtable;
+ struct crst_table *table;
+ union crste entry;
+ int rc;
+
+ *last = NULL;
+ *ptepp = NULL;
+ if (WARN_ON_ONCE(unlikely(!asce.val)))
+ return -EINVAL;
+ if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
+ return -EINVAL;
+ if (!asce_contains_gfn(asce, gfn))
+ return PGM_ADDRESSING;
+
+ table = dereference_asce(asce);
+ if (asce.dt >= ASCE_TYPE_REGION1) {
+ *last = table->crstes + vaddr.rfx;
+ entry = READ_ONCE(**last);
+ if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1))
+ return -EINVAL;
+ if (crste_hole(entry) && !ign_holes)
+ return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+ if (walk_level == TABLE_TYPE_REGION1)
+ return 0;
+ if (entry.pgd.h.i) {
+ if (!allocate)
+ return any ? 0 : -ENOENT;
+ rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ table = dereference_crste(entry.pgd);
+ }
+
+ if (asce.dt >= ASCE_TYPE_REGION2) {
+ *last = table->crstes + vaddr.rsx;
+ entry = READ_ONCE(**last);
+ if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2))
+ return -EINVAL;
+ if (crste_hole(entry) && !ign_holes)
+ return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+ if (walk_level == TABLE_TYPE_REGION2)
+ return 0;
+ if (entry.p4d.h.i) {
+ if (!allocate)
+ return any ? 0 : -ENOENT;
+ rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ table = dereference_crste(entry.p4d);
+ }
+
+ if (asce.dt >= ASCE_TYPE_REGION3) {
+ *last = table->crstes + vaddr.rtx;
+ entry = READ_ONCE(**last);
+ if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3))
+ return -EINVAL;
+ if (crste_hole(entry) && !ign_holes)
+ return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+ if (walk_level == TABLE_TYPE_REGION3 &&
+ continue_anyway && !entry.pud.h.fc && !entry.h.i) {
+ walk_level = TABLE_TYPE_PAGE_TABLE;
+ allocate = false;
+ }
+ if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc))
+ return 0;
+ if (entry.pud.h.i && !entry.pud.h.fc) {
+ if (!allocate)
+ return any ? 0 : -ENOENT;
+ rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) {
+ if (!split)
+ return -EFBIG;
+ rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ table = dereference_crste(entry.pud);
+ }
+
+ *last = table->crstes + vaddr.sx;
+ entry = READ_ONCE(**last);
+ if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT))
+ return -EINVAL;
+ if (crste_hole(entry) && !ign_holes)
+ return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+ if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) {
+ walk_level = TABLE_TYPE_PAGE_TABLE;
+ allocate = false;
+ }
+ if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc))
+ return 0;
+
+ if (entry.pmd.h.i && !entry.pmd.h.fc) {
+ if (!allocate)
+ return any ? 0 : -ENOENT;
+ rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) {
+ if (!split)
+ return -EFBIG;
+ rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+ if (rc)
+ return rc;
+ entry = READ_ONCE(**last);
+ }
+ pgtable = dereference_pmd(entry.pmd);
+ *ptepp = pgtable->ptes + vaddr.px;
+ if (pte_hole(**ptepp) && !ign_holes)
+ return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT;
+ return 0;
+}
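
For illustration, a hypothetical caller (not part of this patch) that resolves the pte backing a guest frame, splitting large pages and allocating missing tables from a previously topped-up cache:

static int example_resolve_pte(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn)
{
	union crste *crstep;
	union pte *ptep;
	int rc;

	/* mc must have been filled with kvm_s390_mmu_cache_topup() beforehand */
	rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_SPLIT_ALLOC, TABLE_TYPE_PAGE_TABLE,
			    &crstep, &ptep);
	if (rc)
		return rc;	/* PGM_ADDRESSING, a PIC number, -EFAULT, -ENOMEM, ... */
	/* ptep points to the pte for gfn, crstep to the segment table entry above it */
	return 0;
}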
+
+static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
+{
+ unsigned int idx = gfn & (_PAGE_ENTRIES - 1);
+ long rc = 0;
+
+ for ( ; gfn < end; idx++, gfn++) {
+ if (pte_hole(READ_ONCE(table->ptes[idx]))) {
+ if (!(w->flags & DAT_WALK_IGN_HOLES))
+ return -EFAULT;
+ if (!(w->flags & DAT_WALK_ANY))
+ continue;
+ }
+
+ rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
+ struct dat_walk *walk)
+{
+ unsigned long idx, cur_shift, cur_size;
+ dat_walk_op the_op;
+ union crste crste;
+ gfn_t cur, next;
+ long rc = 0;
+
+ cur_shift = 8 + table->crstes[0].h.tt * 11;
+ idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
+ cur_size = 1UL << cur_shift;
+
+ for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) {
+ next = cur + cur_size;
+ walk->last = table->crstes + idx;
+ crste = READ_ONCE(*walk->last);
+
+ if (crste_hole(crste)) {
+ if (!(walk->flags & DAT_WALK_IGN_HOLES))
+ return -EFAULT;
+ if (!(walk->flags & DAT_WALK_ANY))
+ continue;
+ }
+
+ the_op = walk->ops->crste_ops[crste.h.tt];
+ if (the_op) {
+ rc = the_op(walk->last, cur, next, walk);
+ crste = READ_ONCE(*walk->last);
+ }
+ if (rc)
+ break;
+ if (!crste.h.i && !crste.h.fc) {
+ if (!is_pmd(crste))
+ rc = dat_crste_walk_range(max(start, cur), min(end, next),
+ _dereference_crste(crste), walk);
+ else if (walk->ops->pte_entry)
+ rc = dat_pte_walk_range(max(start, cur), min(end, next),
+ dereference_pmd(crste.pmd), walk);
+ }
+ }
+ return rc;
+}
+
+/**
+ * _dat_walk_gfn_range() - Walk DAT tables.
+ * @start: The first guest page frame to walk.
+ * @end: The guest page frame immediately after the last one to walk.
+ * @asce: The ASCE of the guest mapping.
+ * @ops: The dat_walk_ops that will be used to perform the walk.
+ * @flags: Flags from DAT_WALK_* (currently only DAT_WALK_IGN_HOLES and DAT_WALK_ANY are supported).
+ * @priv: Will be passed as-is to the callbacks.
+ *
+ * Any callback returning non-zero causes the walk to stop immediately.
+ *
+ * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the
+ * given ASCE unless the DAT_WALK_IGN_HOLES flag is specified,
+ * otherwise it returns whatever the callbacks return.
+ */
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+ const struct dat_walk_ops *ops, int flags, void *priv)
+{
+ struct crst_table *table = dereference_asce(asce);
+ struct dat_walk walk = {
+ .ops = ops,
+ .asce = asce,
+ .priv = priv,
+ .flags = flags,
+ .start = start,
+ .end = end,
+ };
+
+ if (WARN_ON_ONCE(unlikely(!asce.val)))
+ return -EINVAL;
+ if (!asce_contains_gfn(asce, start))
+ return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT;
+
+ return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk);
+}
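
As an illustration of the walker interface (hypothetical, not part of this patch), a callback pair that counts present ptes in a range:

static long example_count_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	long *count = walk->priv;

	if (ptep->s.pr)
		(*count)++;
	return 0;	/* a non-zero return value would stop the walk */
}

static long example_count_present(union asce asce, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = { .pte_entry = example_count_pte, };
	long count = 0;

	/* Large-page leaves are not descended into; they would need pmd/pud callbacks. */
	_dat_walk_gfn_range(start, end, asce, &ops, DAT_WALK_IGN_HOLES, &count);
	return count;
}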
+
+int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ int rc;
+
+ skey->skey = 0;
+ rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+ if (rc)
+ return rc;
+
+ if (!ptep) {
+ union crste crste;
+
+ crste = READ_ONCE(*crstep);
+ if (!crste.h.fc || !crste.s.fc1.pr)
+ return 0;
+ skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn));
+ return 0;
+ }
+ pgste = pgste_get_lock(ptep);
+ if (ptep->h.i) {
+ skey->acc = pgste.acc;
+ skey->fp = pgste.fp;
+ } else {
+ skey->skey = page_get_storage_key(pte_origin(*ptep));
+ }
+ skey->r |= pgste.gr;
+ skey->c |= pgste.gc;
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep)
+{
+ if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc)
+ __atomic64_or(_PAGE_SD, &ptep->val);
+}
+
+int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ union skey skey, bool nq)
+{
+ union pgste pgste, old;
+ union crste *crstep;
+ union pte *ptep;
+ int rc;
+
+ rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
+ &crstep, &ptep);
+ if (rc)
+ return rc;
+
+ if (!ptep) {
+ page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq);
+ return 0;
+ }
+
+ old = pgste_get_lock(ptep);
+ pgste = old;
+
+ pgste.acc = skey.acc;
+ pgste.fp = skey.fp;
+ pgste.gc = skey.c;
+ pgste.gr = skey.r;
+
+ if (!ptep->h.i) {
+ union skey old_skey;
+
+ old_skey.skey = page_get_storage_key(pte_origin(*ptep));
+ pgste.hc |= old_skey.c;
+ pgste.hr |= old_skey.r;
+ old_skey.c = old.gc;
+ old_skey.r = old.gr;
+ skey.r = 0;
+ skey.c = 0;
+ page_set_storage_key(pte_origin(*ptep), skey.skey, !nq);
+ }
+
+ dat_update_ptep_sd(old, pgste, ptep);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ oldkey->skey = page_get_storage_key(paddr);
+ if (oldkey->acc == skey.acc && oldkey->fp == skey.fp &&
+ (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc))
+ return false;
+ page_set_storage_key(paddr, skey.skey, !nq);
+ return true;
+}
+
+int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
+ union skey skey, union skey *oldkey, bool nq, bool mr, bool mc)
+{
+ union pgste pgste, old;
+ union crste *crstep;
+ union skey prev;
+ union pte *ptep;
+ int rc;
+
+ rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
+ &crstep, &ptep);
+ if (rc)
+ return rc;
+
+ if (!ptep)
+ return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey,
+ nq, mr, mc);
+
+ old = pgste_get_lock(ptep);
+ pgste = old;
+
+ rc = 1;
+ pgste.acc = skey.acc;
+ pgste.fp = skey.fp;
+ pgste.gc = skey.c;
+ pgste.gr = skey.r;
+
+ if (!ptep->h.i) {
+ rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc);
+ pgste.hc |= prev.c;
+ pgste.hr |= prev.r;
+ prev.c |= old.gc;
+ prev.r |= old.gr;
+ } else {
+ prev.acc = old.acc;
+ prev.fp = old.fp;
+ prev.c = old.gc;
+ prev.r = old.gr;
+ }
+ if (oldkey)
+ *oldkey = prev;
+
+ dat_update_ptep_sd(old, pgste, ptep);
+ pgste_set_unlock(ptep, pgste);
+ return rc;
+}
+
+int dat_reset_reference_bit(union asce asce, gfn_t gfn)
+{
+ union pgste pgste, old;
+ union crste *crstep;
+ union pte *ptep;
+ int rc;
+
+ rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+ if (rc)
+ return rc;
+
+ if (!ptep) {
+ union crste crste = READ_ONCE(*crstep);
+
+ if (!crste.h.fc || !crste.s.fc1.pr)
+ return 0;
+ return page_reset_referenced(large_crste_to_phys(*crstep, gfn));
+ }
+ old = pgste_get_lock(ptep);
+ pgste = old;
+
+ if (!ptep->h.i) {
+ rc = page_reset_referenced(pte_origin(*ptep));
+ pgste.hr = rc >> 1;
+ }
+ rc |= (pgste.gr << 1) | pgste.gc;
+ pgste.gr = 0;
+
+ dat_update_ptep_sd(old, pgste, ptep);
+ pgste_set_unlock(ptep, pgste);
+ return rc;
+}
+
+static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ pgste.acc = 0;
+ pgste.fp = 0;
+ pgste.gr = 0;
+ pgste.gc = 0;
+ if (ptep->s.pr)
+ page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1);
+ pgste_set_unlock(ptep, pgste);
+
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ phys_addr_t addr, end, origin = crste_origin_large(*crstep);
+
+ if (!crstep->h.fc || !crstep->s.fc1.pr)
+ return 0;
+
+ addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
+ end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
+ while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end)
+ addr = sske_frame(addr, PAGE_DEFAULT_KEY);
+ for ( ; addr < end; addr += PAGE_SIZE)
+ page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1);
+
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+long dat_reset_skeys(union asce asce, gfn_t start)
+{
+ const struct dat_walk_ops ops = {
+ .pte_entry = dat_reset_skeys_pte,
+ .pmd_entry = dat_reset_skeys_crste,
+ .pud_entry = dat_reset_skeys_crste,
+ };
+
+ return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct slot_priv {
+ unsigned long token;
+ struct kvm_s390_mmu_cache *mc;
+};
+
+static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct slot_priv *p = walk->priv;
+ union crste dummy = { .val = p->token };
+ union pte new_pte, pte = READ_ONCE(*ptep);
+
+ new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par);
+
+ /* Table entry already in the desired state. */
+ if (pte.val == new_pte.val)
+ return 0;
+
+ dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false);
+ return 0;
+}
+
+static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union crste new_crste, crste = READ_ONCE(*crstep);
+ struct slot_priv *p = walk->priv;
+
+ new_crste.val = p->token;
+ new_crste.h.tt = crste.h.tt;
+
+ /* Table entry already in the desired state. */
+ if (crste.val == new_crste.val)
+ return 0;
+
+ /* This table entry needs to be updated. */
+ if (walk->start <= gfn && walk->end >= next) {
+ if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+ return -EINVAL;
+ /* A lower level table was present, needs to be freed. */
+ if (!crste.h.fc && !crste.h.i) {
+ if (is_pmd(crste))
+ dat_free_pt(dereference_pmd(crste.pmd));
+ else
+ dat_free_level(dereference_crste(crste), true);
+ }
+ return 0;
+ }
+
+ /* A lower level table is present, things will be handled there. */
+ if (!crste.h.fc && !crste.h.i)
+ return 0;
+ /* Split (install a lower level table), and handle things there. */
+ return dat_split_crste(p->mc, crstep, gfn, walk->asce, false);
+}
+
+static const struct dat_walk_ops dat_slot_ops = {
+ .pte_entry = _dat_slot_pte,
+ .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, },
+};
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+ u16 type, u16 param)
+{
+ struct slot_priv priv = {
+ .token = _CRSTE_TOK(0, type, param).val,
+ .mc = mc,
+ };
+
+ return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops,
+ DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv);
+}
+
+static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (!pgstes[i].pcl)
+ break;
+ pgste_set_unlock(first + i, pgstes[i]);
+ }
+}
+
+static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (!pgste_get_trylock(first + i, pgstes + i))
+ break;
+ }
+ if (i == n)
+ return true;
+ pgste_set_unlock_multiple(first, n, pgstes);
+ return false;
+}
+
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param)
+{
+ union pgste pgstes[4] = {};
+ unsigned long res = 0;
+ int i, n;
+
+ n = param.len + 1;
+
+ while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+ cpu_relax();
+
+ for (i = 0; i < n; i++)
+ res = res << 16 | pgstes[i].val16;
+
+ pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+ return res;
+}
+
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val)
+{
+ union pgste pgstes[4] = {};
+ int i, n;
+
+ n = param.len + 1;
+
+ while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+ cpu_relax();
+
+ for (i = param.len; i >= 0; i--) {
+ pgstes[i].val16 = val;
+ val = val >> 16;
+ }
+
+ pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+}
+
+static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk)
+{
+ return ptep->s.y;
+}
+
+static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end,
+ struct dat_walk *walk)
+{
+ return crstep->h.fc && crstep->s.fc1.y;
+}
+
+static const struct dat_walk_ops test_age_ops = {
+ .pte_entry = _dat_test_young_pte,
+ .pmd_entry = _dat_test_young_crste,
+ .pud_entry = _dat_test_young_crste,
+};
+
+/**
+ * dat_test_age_gfn() - Test young.
+ * @asce: The ASCE whose address range is to be tested.
+ * @start: The first guest frame of the range to check.
+ * @end: The guest frame after the last in the range.
+ *
+ * Context: called by KVM common code with the kvm mmu write lock held.
+ *
+ * Return: %true if any page in the given range is young, otherwise %false.
+ */
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
+{
+ return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
+}
+
+static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union crste newcrste, oldcrste;
+ int *n = walk->priv;
+
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
+ return 0;
+ if (oldcrste.s.fc1.prefix_notif)
+ break;
+ newcrste = oldcrste;
+ newcrste.s.fc1.prefix_notif = 1;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
+ *n = 2;
+ return 0;
+}
+
+static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ int *n = walk->priv;
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ if (!ptep->h.i && !ptep->h.p) {
+ pgste.prefix_notif = 1;
+ *n += 1;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
+{
+ static const struct dat_walk_ops ops = {
+ .pte_entry = dat_set_pn_pte,
+ .pmd_entry = dat_set_pn_crste,
+ .pud_entry = dat_set_pn_crste,
+ };
+
+ int n = 0;
+
+ _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n);
+ if (n != 2)
+ return -EAGAIN;
+ return 0;
+}
+
+/**
+ * dat_perform_essa() - Perform ESSA actions on the PGSTE.
+ * @asce: The asce to operate on.
+ * @gfn: The guest page frame to operate on.
+ * @orc: The specific action to perform, see the ESSA_SET_* macros.
+ * @state: The storage attributes to be returned to the guest.
+ * @dirty: Returns whether the function dirtied a previously clean entry.
+ *
+ * Context: Called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * %1 if the page state has been altered and the page is to be added to the CBRL
+ * * %0 if the page state has been altered, but the page is not to be added to the CBRL
+ * * %-1 if the page state has not been altered and the page is not to be added to the CBRL
+ */
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ int res = 0;
+
+ if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) {
+ *state = (union essa_state) { .exception = 1 };
+ return -1;
+ }
+
+ pgste = pgste_get_lock(ptep);
+
+ *state = (union essa_state) {
+ .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero),
+ .nodat = pgste.nodat,
+ .usage = pgste.usage,
+ };
+
+ switch (orc) {
+ case ESSA_GET_STATE:
+ res = -1;
+ break;
+ case ESSA_SET_STABLE:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ pgste.nodat = 0;
+ break;
+ case ESSA_SET_UNUSED:
+ pgste.usage = PGSTE_GPS_USAGE_UNUSED;
+ if (ptep->h.i)
+ res = 1;
+ break;
+ case ESSA_SET_VOLATILE:
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ if (ptep->h.i)
+ res = 1;
+ break;
+ case ESSA_SET_POT_VOLATILE:
+ if (!ptep->h.i) {
+ pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE;
+ } else if (pgste.zero) {
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ } else if (!pgste.gc) {
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ res = 1;
+ }
+ break;
+ case ESSA_SET_STABLE_RESIDENT:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ /*
+ * Since the resident state can go away any time after this
+ * call, we will not make this page resident. We can revisit
+ * this decision if a guest will ever start using this.
+ */
+ break;
+ case ESSA_SET_STABLE_IF_RESIDENT:
+ if (!ptep->h.i)
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ break;
+ case ESSA_SET_STABLE_NODAT:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ pgste.nodat = 1;
+ break;
+ default:
+ WARN_ONCE(1, "Invalid ORC!");
+ res = -1;
+ break;
+ }
+ /* If we are discarding a page, set it to logical zero. */
+ pgste.zero = res == 1;
+ if (orc > 0) {
+ *dirty = !pgste.cmma_d;
+ pgste.cmma_d = 1;
+ }
+
+ pgste_set_unlock(ptep, pgste);
+
+ return res;
+}
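
A hypothetical caller sketch (not part of this patch; the cbrl buffer and its index are placeholders) showing how the three-way return value and the @dirty flag are meant to be consumed:

static int example_essa_one_gfn(union asce asce, gfn_t gfn, int orc, gfn_t *cbrl, int *n)
{
	union essa_state state;
	bool dirty = false;
	int res;

	res = dat_perform_essa(asce, gfn, orc, &state, &dirty);
	if (res == 1)
		cbrl[(*n)++] = gfn;	/* page state changed: add it to the CBRL */
	/* "dirty" reports that a previously clean cmma_d entry was dirtied;  */
	/* "state.val" carries the storage attributes to return to the guest */
	return res;
}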
+
+static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ pgste.usage = 0;
+ pgste.nodat = 0;
+ pgste.cmma_d = 0;
+ pgste_set_unlock(ptep, pgste);
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+long dat_reset_cmma(union asce asce, gfn_t start)
+{
+ const struct dat_walk_ops dat_reset_cmma_ops = {
+ .pte_entry = dat_reset_cmma_pte,
+ };
+
+ return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops,
+ DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct dat_get_cmma_state {
+ gfn_t start;
+ gfn_t end;
+ unsigned int count;
+ u8 *values;
+ atomic64_t *remaining;
+};
+
+static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6);
+ pgste_set_unlock(ptep, pgste);
+ state->end = next;
+
+ return 0;
+}
+
+static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+
+ if (crstep->h.i)
+ state->end = min(walk->end, next);
+ return 0;
+}
+
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values)
+{
+ const struct dat_walk_ops ops = {
+ .pte_entry = __dat_peek_cmma_pte,
+ .pmd_entry = __dat_peek_cmma_crste,
+ .pud_entry = __dat_peek_cmma_crste,
+ .p4d_entry = __dat_peek_cmma_crste,
+ .pgd_entry = __dat_peek_cmma_crste,
+ };
+ struct dat_get_cmma_state state = { .values = values, };
+ int rc;
+
+ rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state);
+ *count = state.end - start;
+ /* Return success if at least one value was saved, otherwise an error. */
+ return (rc == -EFAULT && *count > 0) ? 0 : rc;
+}
+
+static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+ union pgste pgste;
+
+ if (state->start != -1) {
+ if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
+ return 1;
+ if (gfn - state->start >= state->count)
+ return 1;
+ }
+
+ if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
+ return 0;
+
+ pgste = pgste_get_lock(ptep);
+ if (pgste.cmma_d) {
+ if (state->start == -1)
+ state->start = gfn;
+ pgste.cmma_d = 0;
+ atomic64_dec(state->remaining);
+ state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6;
+ state->end = next;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
+{
+ const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
+ struct dat_get_cmma_state state = {
+ .remaining = rem,
+ .values = values,
+ .count = *count,
+ .start = -1,
+ };
+
+ _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);
+
+ if (state.start == -1) {
+ *count = 0;
+ } else {
+ *count = state.end - state.start;
+ *start = state.start;
+ }
+
+ return 0;
+}
+
+struct dat_set_cmma_state {
+ unsigned long mask;
+ const u8 *bits;
+};
+
+static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_set_cmma_state *state = walk->priv;
+ union pgste pgste, tmp;
+
+ tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;
+
+ pgste = pgste_get_lock(ptep);
+ pgste.usage = tmp.usage;
+ pgste.nodat = tmp.nodat;
+ pgste_set_unlock(ptep, pgste);
+
+ return 0;
+}
+
+/**
+ * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages.
+ * @mc: Cache used for allocations.
+ * @asce: The ASCE of the guest.
+ * @gfn: The guest frame of the first page whose CMMA bits are to be set.
+ * @count: How many pages need to be processed.
+ * @mask: Which PGSTE bits should be set.
+ * @bits: Points to an array with the CMMA attributes.
+ *
+ * This function sets the CMMA attributes for the given pages. If the input
+ * buffer has zero length, no action is taken, otherwise the attributes are
+ * set and the mm->context.uses_cmm flag is set.
+ *
+ * Each byte in @bits contains new values for bits 32-39 of the PGSTE.
+ * Currently, only the fields NT and US are applied.
+ *
+ * Return: %0 in case of success, a negative error value otherwise.
+ */
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ unsigned long count, unsigned long mask, const uint8_t *bits)
+{
+ const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
+ struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
+ union crste *crstep;
+ union pte *ptep;
+ gfn_t cur;
+ int rc;
+
+ for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) {
+ rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
+ &crstep, &ptep);
+ if (rc)
+ return rc;
+ }
+ return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
+}
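
A hypothetical caller sketch (not part of this patch); the mask is derived here from the union pgste bit-field layout in dat.h, covering only the usage (US) and nodat (NT) fields mentioned above:

static int example_apply_cmma(struct kvm_s390_mmu_cache *mc, union asce asce,
			      gfn_t gfn, unsigned long count, const u8 *bits)
{
	unsigned long mask = ((union pgste) { .usage = 3, .nodat = 1 }).val;

	return dat_set_cmma_bits(mc, asce, gfn, count, mask, bits);
}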
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
new file mode 100644
index 000000000000..8f8278c44879
--- /dev/null
+++ b/arch/s390/kvm/dat.h
@@ -0,0 +1,976 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2024, 2025
+ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_DAT_H
+#define __KVM_S390_DAT_H
+
+#include <linux/radix-tree.h>
+#include <linux/refcount.h>
+#include <linux/io.h>
+#include <linux/kvm_types.h>
+#include <linux/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/dat-bits.h>
+
+/*
+ * Base address and length must be sent at the start of each block, therefore
+ * it's cheaper to send some clean data, as long as it's less than the size of
+ * two longs.
+ */
+#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
+/* For consistency */
+#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
+
+#define _ASCE(x) ((union asce) { .val = (x), })
+#define NULL_ASCE _ASCE(0)
+
+enum {
+ _DAT_TOKEN_NONE = 0,
+ _DAT_TOKEN_PIC,
+};
+
+#define _CRSTE_TOK(l, t, p) ((union crste) { \
+ .tok.i = 1, \
+ .tok.tt = (l), \
+ .tok.type = (t), \
+ .tok.par = (p) \
+ })
+#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p)
+
+#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING)
+#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0)
+
+#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT)
+
+#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) })
+#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0)
+
+/* This fake table type is used for page table walks (both for normal page tables and vSIE) */
+#define TABLE_TYPE_PAGE_TABLE -1
+
+enum dat_walk_flags {
+ DAT_WALK_USES_SKEYS = 0x40,
+ DAT_WALK_CONTINUE = 0x20,
+ DAT_WALK_IGN_HOLES = 0x10,
+ DAT_WALK_SPLIT = 0x08,
+ DAT_WALK_ALLOC = 0x04,
+ DAT_WALK_ANY = 0x02,
+ DAT_WALK_LEAF = 0x01,
+ DAT_WALK_DEFAULT = 0
+};
+
+#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC)
+#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC)
+#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC)
+
+union pte {
+ unsigned long val;
+ union page_table_entry h;
+ struct {
+ unsigned long :56; /* Hardware bits */
+ unsigned long u : 1; /* Page unused */
+ unsigned long s : 1; /* Special */
+ unsigned long w : 1; /* Writable */
+ unsigned long r : 1; /* Readable */
+ unsigned long d : 1; /* Dirty */
+ unsigned long y : 1; /* Young */
+ unsigned long sd: 1; /* Soft dirty */
+ unsigned long pr: 1; /* Present */
+ } s;
+ struct {
+ unsigned char hwbytes[7];
+ unsigned char swbyte;
+ };
+ union {
+ struct {
+ unsigned long type :16; /* Token type */
+ unsigned long par :16; /* Token parameter */
+ unsigned long :20;
+ unsigned long : 1; /* Must be 0 */
+ unsigned long i : 1; /* Must be 1 */
+ unsigned long : 2;
+ unsigned long : 7;
+ unsigned long pr : 1; /* Must be 0 */
+ };
+ struct {
+ unsigned long token:32; /* Token and parameter */
+ unsigned long :32;
+ };
+ } tok;
+};
+
+#define _SEGMENT_FR_MASK (_SEGMENT_MASK >> PAGE_SHIFT)
+#define _REGION3_FR_MASK (_REGION3_MASK >> PAGE_SHIFT)
+#define _PAGES_PER_SEGMENT _PAGE_ENTRIES
+#define _PAGES_PER_REGION3 (_PAGES_PER_SEGMENT * _CRST_ENTRIES)
+
+/* Soft dirty, needed as macro for atomic operations on ptes */
+#define _PAGE_SD 0x002
+
+/* Needed as macro to perform atomic operations */
+#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */
+#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */
+
+enum pgste_gps_usage {
+ PGSTE_GPS_USAGE_STABLE = 0,
+ PGSTE_GPS_USAGE_UNUSED,
+ PGSTE_GPS_USAGE_POT_VOLATILE,
+ PGSTE_GPS_USAGE_VOLATILE,
+};
+
+union pgste {
+ unsigned long val;
+ struct {
+ unsigned long acc : 4;
+ unsigned long fp : 1;
+ unsigned long : 3;
+ unsigned long pcl : 1;
+ unsigned long hr : 1;
+ unsigned long hc : 1;
+ unsigned long : 2;
+ unsigned long gr : 1;
+ unsigned long gc : 1;
+ unsigned long : 1;
+ unsigned long :16; /* val16 */
+ unsigned long zero : 1;
+ unsigned long nodat : 1;
+ unsigned long : 4;
+ unsigned long usage : 2;
+ unsigned long : 8;
+ unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+ unsigned long vsie_notif : 1; /* Referenced in a shadow table */
+ unsigned long : 5;
+ unsigned long : 8;
+ };
+ struct {
+ unsigned short hwbytes0;
+ unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */
+ unsigned short hwbytes4;
+ unsigned char flags; /* Maps to the software bits */
+ unsigned char hwbyte7;
+ } __packed;
+};
+
+union pmd {
+ unsigned long val;
+ union segment_table_entry h;
+ struct {
+ struct {
+ unsigned long :44; /* HW */
+ unsigned long : 3; /* Unused */
+ unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
+ unsigned long w : 1; /* Writable soft-bit */
+ unsigned long r : 1; /* Readable soft-bit */
+ unsigned long d : 1; /* Dirty */
+ unsigned long y : 1; /* Young */
+ unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+ unsigned long vsie_notif : 1; /* Referenced in a shadow table */
+ unsigned long : 4; /* HW */
+ unsigned long sd : 1; /* Soft-Dirty */
+ unsigned long pr : 1; /* Present */
+ } fc1;
+ } s;
+};
+
+union pud {
+ unsigned long val;
+ union region3_table_entry h;
+ struct {
+ struct {
+ unsigned long :33; /* HW */
+ unsigned long :14; /* Unused */
+ unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
+ unsigned long w : 1; /* Writable soft-bit */
+ unsigned long r : 1; /* Readable soft-bit */
+ unsigned long d : 1; /* Dirty */
+ unsigned long y : 1; /* Young */
+ unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+ unsigned long vsie_notif : 1; /* Referenced in a shadow table */
+ unsigned long : 4; /* HW */
+ unsigned long sd : 1; /* Soft-Dirty */
+ unsigned long pr : 1; /* Present */
+ } fc1;
+ } s;
+};
+
+union p4d {
+ unsigned long val;
+ union region2_table_entry h;
+};
+
+union pgd {
+ unsigned long val;
+ union region1_table_entry h;
+};
+
+union crste {
+ unsigned long val;
+ union {
+ struct {
+ unsigned long :52;
+ unsigned long : 1;
+ unsigned long fc: 1;
+ unsigned long p : 1;
+ unsigned long : 1;
+ unsigned long : 2;
+ unsigned long i : 1;
+ unsigned long : 1;
+ unsigned long tt: 2;
+ unsigned long : 2;
+ };
+ struct {
+ unsigned long to:52;
+ unsigned long : 1;
+ unsigned long fc: 1;
+ unsigned long p : 1;
+ unsigned long : 1;
+ unsigned long tf: 2;
+ unsigned long i : 1;
+ unsigned long : 1;
+ unsigned long tt: 2;
+ unsigned long tl: 2;
+ } fc0;
+ struct {
+ unsigned long :47;
+ unsigned long av : 1; /* ACCF-Validity Control */
+ unsigned long acc: 4; /* Access-Control Bits */
+ unsigned long f : 1; /* Fetch-Protection Bit */
+ unsigned long fc : 1; /* Format-Control */
+ unsigned long p : 1; /* DAT-Protection Bit */
+ unsigned long iep: 1; /* Instruction-Execution-Protection */
+ unsigned long : 2;
+ unsigned long i : 1; /* Segment-Invalid Bit */
+ unsigned long cs : 1; /* Common-Segment Bit */
+ unsigned long tt : 2; /* Table-Type Bits */
+ unsigned long : 2;
+ } fc1;
+ } h;
+ struct {
+ struct {
+ unsigned long :47;
+ unsigned long : 1; /* HW (should be 0) */
+ unsigned long s : 1; /* Special */
+ unsigned long w : 1; /* Writable */
+ unsigned long r : 1; /* Readable */
+ unsigned long d : 1; /* Dirty */
+ unsigned long y : 1; /* Young */
+ unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+ unsigned long vsie_notif : 1; /* Referenced in a shadow table */
+ unsigned long : 4; /* HW */
+ unsigned long sd : 1; /* Soft-Dirty */
+ unsigned long pr : 1; /* Present */
+ } fc1;
+ } s;
+ union {
+ struct {
+ unsigned long type :16; /* Token type */
+ unsigned long par :16; /* Token parameter */
+ unsigned long :26;
+ unsigned long i : 1; /* Must be 1 */
+ unsigned long : 1;
+ unsigned long tt : 2;
+ unsigned long : 1;
+ unsigned long pr : 1; /* Must be 0 */
+ };
+ struct {
+ unsigned long token:32; /* Token and parameter */
+ unsigned long :32;
+ };
+ } tok;
+ union pmd pmd;
+ union pud pud;
+ union p4d p4d;
+ union pgd pgd;
+};
+
+union skey {
+ unsigned char skey;
+ struct {
+ unsigned char acc :4;
+ unsigned char fp :1;
+ unsigned char r :1;
+ unsigned char c :1;
+ unsigned char zero:1;
+ };
+};
+
+static_assert(sizeof(union pgste) == sizeof(unsigned long));
+static_assert(sizeof(union pte) == sizeof(unsigned long));
+static_assert(sizeof(union pmd) == sizeof(unsigned long));
+static_assert(sizeof(union pud) == sizeof(unsigned long));
+static_assert(sizeof(union p4d) == sizeof(unsigned long));
+static_assert(sizeof(union pgd) == sizeof(unsigned long));
+static_assert(sizeof(union crste) == sizeof(unsigned long));
+static_assert(sizeof(union skey) == sizeof(char));
+
+struct segment_table {
+ union pmd pmds[_CRST_ENTRIES];
+};
+
+struct region3_table {
+ union pud puds[_CRST_ENTRIES];
+};
+
+struct region2_table {
+ union p4d p4ds[_CRST_ENTRIES];
+};
+
+struct region1_table {
+ union pgd pgds[_CRST_ENTRIES];
+};
+
+struct crst_table {
+ union {
+ union crste crstes[_CRST_ENTRIES];
+ struct segment_table segment;
+ struct region3_table region3;
+ struct region2_table region2;
+ struct region1_table region1;
+ };
+};
+
+struct page_table {
+ union pte ptes[_PAGE_ENTRIES];
+ union pgste pgstes[_PAGE_ENTRIES];
+};
+
+static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE);
+static_assert(sizeof(struct page_table) == PAGE_SIZE);
+
+struct dat_walk;
+
+typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w);
+
+struct dat_walk_ops {
+ union {
+ dat_walk_op crste_ops[4];
+ struct {
+ dat_walk_op pmd_entry;
+ dat_walk_op pud_entry;
+ dat_walk_op p4d_entry;
+ dat_walk_op pgd_entry;
+ };
+ };
+ long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w);
+};
+
+struct dat_walk {
+ const struct dat_walk_ops *ops;
+ union crste *last;
+ union pte *last_pte;
+ union asce asce;
+ gfn_t start;
+ gfn_t end;
+ int flags;
+ void *priv;
+};
+
+struct ptval_param {
+ unsigned char offset : 6;
+ unsigned char len : 2;
+};
+
+/**
+ * _pte() - Useful constructor for union pte
+ * @pfn: the pfn this pte should point to.
+ * @writable: whether the pte should be writable.
+ * @dirty: whether the pte should be dirty.
+ * @special: whether the pte should be marked as special
+ *
+ * The pte is also marked as young and present. If the pte is marked as dirty,
+ * it gets marked as soft-dirty too. If the pte is not dirty, the hardware
+ * protect bit is set (independently of the write softbit); this way proper
+ * dirty tracking can be performed.
+ *
+ * Return: a union pte value.
+ */
+static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special)
+{
+ union pte res = { .val = PFN_PHYS(pfn) };
+
+ res.h.p = !dirty;
+ res.s.y = 1;
+ res.s.pr = 1;
+ res.s.w = writable;
+ res.s.d = dirty;
+ res.s.sd = dirty;
+ res.s.s = special;
+ return res;
+}
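
A minimal sketch (not part of this patch) of the dirty-tracking convention described above:

static void example_pte_ctor(kvm_pfn_t pfn)
{
	union pte clean = _pte(pfn, true, false, false);
	union pte dirty = _pte(pfn, true, true, false);

	WARN_ON(!clean.h.p || clean.s.d);	/* writable but clean: hardware-protected */
	WARN_ON(dirty.h.p || !dirty.s.sd);	/* dirty: unprotected, dirty and soft-dirty */
}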
+
+static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt)
+{
+ union crste res = { .val = PFN_PHYS(pfn) };
+
+ res.h.tt = tt;
+ res.h.fc0.tl = _REGION_ENTRY_LENGTH;
+ res.h.fc0.tf = 0;
+ return res;
+}
+
+/**
+ * _crste_fc1() - Useful constructor for union crste with FC=1
+ * @pfn: the pfn this crste should point to.
+ * @tt: the table type
+ * @writable: whether the crste should be writable.
+ * @dirty: whether the crste should be dirty.
+ *
+ * The crste is also marked as young and present. If the crste is marked as
+ * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the
+ * hardware protect bit is set (independently of the write softbit); this way
+ * proper dirty tracking can be performed.
+ *
+ * Return: a union crste value.
+ */
+static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty)
+{
+ union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK };
+
+ res.h.tt = tt;
+ res.h.p = !dirty;
+ res.h.fc = 1;
+ res.s.fc1.y = 1;
+ res.s.fc1.pr = 1;
+ res.s.fc1.w = writable;
+ res.s.fc1.d = dirty;
+ res.s.fc1.sd = dirty;
+ return res;
+}
+
+union essa_state {
+ unsigned char val;
+ struct {
+ unsigned char : 2;
+ unsigned char nodat : 1;
+ unsigned char exception : 1;
+ unsigned char usage : 2;
+ unsigned char content : 2;
+ };
+};
+
+/**
+ * struct vsie_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @r_gfn: virtual rmap address in the shadow guest address space
+ */
+struct vsie_rmap {
+ struct vsie_rmap *next;
+ union {
+ unsigned long val;
+ struct {
+ long level: 8;
+ unsigned long : 4;
+ unsigned long r_gfn:52;
+ };
+ };
+};
+
+static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long));
+
+#define KVM_S390_MMU_CACHE_N_CRSTS 6
+#define KVM_S390_MMU_CACHE_N_PTS 2
+#define KVM_S390_MMU_CACHE_N_RMAPS 16
+struct kvm_s390_mmu_cache {
+ void *crsts[KVM_S390_MMU_CACHE_N_CRSTS];
+ void *pts[KVM_S390_MMU_CACHE_N_PTS];
+ void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS];
+ short int n_crsts;
+ short int n_pts;
+ short int n_rmaps;
+};
+
+struct guest_fault {
+ gfn_t gfn; /* Guest frame */
+ kvm_pfn_t pfn; /* Host PFN */
+ struct page *page; /* Host page */
+ union pte *ptep; /* Used to resolve the fault, or NULL */
+ union crste *crstep; /* Used to resolve the fault, or NULL */
+ bool writable; /* Mapping is writable */
+ bool write_attempt; /* Write access attempted */
+ bool attempt_pfault; /* Attempt a pfault first */
+ bool valid; /* This entry contains valid data */
+ void (*callback)(struct guest_fault *f);
+ void *priv;
+};
+
+/*
+ * 0 1 2 3 4 5 6 7
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | | PGT_ADDR |
+ * 8 | VMADDR | |
+ * 16 | |
+ * 24 | |
+ */
+#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1})
+#define PTVAL_PGT_ADDR MKPTVAL(4, 8)
+#define PTVAL_VMADDR MKPTVAL(8, 6)
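
For illustration (not part of this patch), how the offset/len encoding maps onto the val16 fields: PTVAL_PGT_ADDR is MKPTVAL(4, 8), i.e. offset 4 and len (8 + 1) / 2 - 1 = 3, so an 8-byte value is stored as four 16-bit chunks in pgstes 4..7; PTVAL_VMADDR occupies three chunks in pgstes 8..10.

static void example_ptval_roundtrip(struct page_table *pt)
{
	dat_set_ptval(pt, PTVAL_PGT_ADDR, 0x12345678abcd0000UL);
	WARN_ON(dat_get_ptval(pt, PTVAL_PGT_ADDR) != 0x12345678abcd0000UL);
}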
+
+union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new,
+ gfn_t gfn, union asce asce, bool uses_skeys);
+bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
+ union asce asce);
+void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce);
+
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+ const struct dat_walk_ops *ops, int flags, void *priv);
+
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+ int walk_level, union crste **last, union pte **ptepp);
+void dat_free_level(struct crst_table *table, bool owns_ptes);
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype);
+int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey);
+int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ union skey skey, bool nq);
+int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
+ union skey skey, union skey *oldkey, bool nq, bool mr, bool mc);
+int dat_reset_reference_bit(union asce asce, gfn_t gfn);
+long dat_reset_skeys(union asce asce, gfn_t start);
+
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param);
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val);
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+ u16 type, u16 param);
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
+
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
+long dat_reset_cmma(union asce asce, gfn_t start_gfn);
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values);
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem);
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ unsigned long count, unsigned long mask, const uint8_t *bits);
+
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
+
+#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
+
+static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc)
+{
+ if (mc->n_pts)
+ return mc->pts[--mc->n_pts];
+ return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE);
+}
+
+static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc)
+{
+ if (mc->n_crsts)
+ return mc->crsts[--mc->n_crsts];
+ return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER);
+}
+
+static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc)
+{
+ if (mc->n_rmaps)
+ return mc->rmaps[--mc->n_rmaps];
+ return kzalloc_obj(struct vsie_rmap, GFP_KVM_S390_MMU_CACHE);
+}
+
+static inline struct crst_table *crste_table_start(union crste *crstep)
+{
+ return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE);
+}
+
+static inline struct page_table *pte_table_start(union pte *ptep)
+{
+ return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE);
+}
+
+static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn,
+ union asce asce)
+{
+ unsigned long dtt = 0x10 | new.h.tt << 2;
+ void *table = crste_table_start(crstep);
+
+ return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val);
+}
+
+/**
+ * idte_crste() - invalidate a crste entry using idte
+ * @crstep: pointer to the crste to be invalidated
+ * @gfn: a gfn mapped by the crste
+ * @opt: options for the idte instruction
+ * @asce: the asce
+ * @local: whether the operation is cpu-local
+ */
+static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt,
+ union asce asce, int local)
+{
+ unsigned long table_origin = __pa(crste_table_start(crstep));
+ unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK;
+
+ if (__builtin_constant_p(opt) && opt == 0) {
+ /* flush without guest asce */
+ asm volatile("idte %[table_origin],0,%[gaddr],%[local]"
+ : "+m" (*crstep)
+ : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr),
+ [local] "i" (local)
+ : "cc");
+ } else {
+ /* flush with guest asce */
+ asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]"
+ : "+m" (*crstep)
+ : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt),
+ [asce] "a" (asce.val), [local] "i" (local)
+ : "cc");
+ }
+}
+
+static inline void dat_init_pgstes(struct page_table *pt, unsigned long val)
+{
+ memset64((void *)pt->pgstes, val, PTRS_PER_PTE);
+}
+
+static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes,
+ unsigned long pgstes)
+{
+ memset64((void *)pt->ptes, ptes, PTRS_PER_PTE);
+ dat_init_pgstes(pt, pgstes);
+}
+
+static inline gfn_t asce_end(union asce asce)
+{
+ return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT);
+}
+
+#define _CRSTE(x) ((union crste) { .val = _Generic((x), \
+ union pgd : (x).val, \
+ union p4d : (x).val, \
+ union pud : (x).val, \
+ union pmd : (x).val, \
+ union crste : (x).val)})
+
+#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \
+ union pgd : (x), \
+ union p4d : (x), \
+ union pud : (x), \
+ union pmd : (x), \
+ union crste : (x)))
+
+#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \
+ struct crst_table : (x), \
+ struct segment_table : (x), \
+ struct region3_table : (x), \
+ struct region2_table : (x), \
+ struct region1_table : (x)))
+
+static inline bool asce_contains_gfn(union asce asce, gfn_t gfn)
+{
+ return gfn < asce_end(asce);
+}
+
+static inline bool is_pmd(union crste crste)
+{
+ return crste.h.tt == TABLE_TYPE_SEGMENT;
+}
+
+static inline bool is_pud(union crste crste)
+{
+ return crste.h.tt == TABLE_TYPE_REGION3;
+}
+
+static inline bool is_p4d(union crste crste)
+{
+ return crste.h.tt == TABLE_TYPE_REGION2;
+}
+
+static inline bool is_pgd(union crste crste)
+{
+ return crste.h.tt == TABLE_TYPE_REGION1;
+}
+
+static inline phys_addr_t pmd_origin_large(union pmd pmd)
+{
+ return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE;
+}
+
+static inline phys_addr_t pud_origin_large(union pud pud)
+{
+ return pud.val & _REGION3_ENTRY_ORIGIN_LARGE;
+}
+
+/**
+ * crste_origin_large() - Return the large frame origin of a large crste
+ * @crste: The crste whose origin is to be returned. Should be either a
+ * region-3 table entry or a segment table entry, in both cases with
+ * FC set to 1 (large pages).
+ *
+ * Return: The origin of the large frame pointed to by @crste, or -1 if the
+ * crste was not large (wrong table type, or FC==0)
+ */
+static inline phys_addr_t crste_origin_large(union crste crste)
+{
+ if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3))
+ return -1;
+ if (is_pmd(crste))
+ return pmd_origin_large(crste.pmd);
+ return pud_origin_large(crste.pud);
+}
+
+#define crste_origin(x) (_Generic((x), \
+ union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \
+ union pud : (x).val & _REGION_ENTRY_ORIGIN, \
+ union p4d : (x).val & _REGION_ENTRY_ORIGIN, \
+ union pgd : (x).val & _REGION_ENTRY_ORIGIN))
+
+static inline unsigned long pte_origin(union pte pte)
+{
+ return pte.val & PAGE_MASK;
+}
+
+static inline bool pmd_prefix(union pmd pmd)
+{
+ return pmd.h.fc && pmd.s.fc1.prefix_notif;
+}
+
+static inline bool pud_prefix(union pud pud)
+{
+ return pud.h.fc && pud.s.fc1.prefix_notif;
+}
+
+static inline bool crste_leaf(union crste crste)
+{
+ return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc;
+}
+
+static inline bool crste_prefix(union crste crste)
+{
+ return crste_leaf(crste) && crste.s.fc1.prefix_notif;
+}
+
+static inline bool crste_dirty(union crste crste)
+{
+ return crste_leaf(crste) && crste.s.fc1.d;
+}
+
+static inline union pgste *pgste_of(union pte *pte)
+{
+ return (union pgste *)(pte + _PAGE_ENTRIES);
+}
+
+static inline bool pte_hole(union pte pte)
+{
+ return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE;
+}
+
+static inline bool _crste_hole(union crste crste)
+{
+ return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE;
+}
+
+#define crste_hole(x) _crste_hole(_CRSTE(x))
+
+static inline bool _crste_none(union crste crste)
+{
+ return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE;
+}
+
+#define crste_none(x) _crste_none(_CRSTE(x))
+
+static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn)
+{
+ return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK);
+}
+
+static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn)
+{
+ return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK);
+}
+
+static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn)
+{
+ if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3))
+ return -1;
+ if (is_pmd(crste))
+ return large_pmd_to_phys(crste.pmd, gfn);
+ return large_pud_to_phys(crste.pud, gfn);
+}
+
+static inline bool cspg_crste(union crste *crstep, union crste old, union crste new)
+{
+ return cspg(&crstep->val, old.val, new.val);
+}
+
+static inline struct page_table *dereference_pmd(union pmd pmd)
+{
+ return phys_to_virt(crste_origin(pmd));
+}
+
+static inline struct segment_table *dereference_pud(union pud pud)
+{
+ return phys_to_virt(crste_origin(pud));
+}
+
+static inline struct region3_table *dereference_p4d(union p4d p4d)
+{
+ return phys_to_virt(crste_origin(p4d));
+}
+
+static inline struct region2_table *dereference_pgd(union pgd pgd)
+{
+ return phys_to_virt(crste_origin(pgd));
+}
+
+static inline struct crst_table *_dereference_crste(union crste crste)
+{
+ if (unlikely(is_pmd(crste)))
+ return NULL;
+ return phys_to_virt(crste_origin(crste.pud));
+}
+
+#define dereference_crste(x) (_Generic((x), \
+ union pud : _dereference_crste(_CRSTE(x)), \
+ union p4d : _dereference_crste(_CRSTE(x)), \
+ union pgd : _dereference_crste(_CRSTE(x)), \
+ union crste : _dereference_crste(_CRSTE(x))))
+
+static inline struct crst_table *dereference_asce(union asce asce)
+{
+ return phys_to_virt(asce.val & _ASCE_ORIGIN);
+}
+
+static inline void asce_flush_tlb(union asce asce)
+{
+ __tlb_flush_idte(asce.val);
+}
+
+static inline bool pgste_get_trylock(union pte *ptep, union pgste *res)
+{
+ union pgste *pgstep = pgste_of(ptep);
+ union pgste old_pgste;
+
+ if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT)
+ return false;
+ old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val);
+ if (old_pgste.pcl)
+ return false;
+ old_pgste.pcl = 1;
+ *res = old_pgste;
+ return true;
+}
+
+static inline union pgste pgste_get_lock(union pte *ptep)
+{
+ union pgste res;
+
+ while (!pgste_get_trylock(ptep, &res))
+ cpu_relax();
+ return res;
+}
+
+static inline void pgste_set_unlock(union pte *ptep, union pgste pgste)
+{
+ pgste.pcl = 0;
+ barrier();
+ WRITE_ONCE(*pgste_of(ptep), pgste);
+}
+
+static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce,
+ bool has_skeys)
+{
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys);
+ pgste_set_unlock(ptep, pgste);
+}
+
+static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys)
+{
+ dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys);
+}
+
+static inline void dat_free_pt(struct page_table *pt)
+{
+ free_page((unsigned long)pt);
+}
+
+static inline void _dat_free_crst(struct crst_table *table)
+{
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+}
+
+#define dat_free_crst(x) _dat_free_crst(_CRSTP(x))
+
+static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc)
+{
+ if (!mc)
+ return;
+ while (mc->n_pts)
+ dat_free_pt(mc->pts[--mc->n_pts]);
+ while (mc->n_crsts)
+ _dat_free_crst(mc->crsts[--mc->n_crsts]);
+ while (mc->n_rmaps)
+ kfree(mc->rmaps[--mc->n_rmaps]);
+ kfree(mc);
+}
+
+DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T))
+
+static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void)
+{
+ struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
+
+ mc = kzalloc_obj(*mc, GFP_KERNEL_ACCOUNT);
+ if (mc && !kvm_s390_mmu_cache_topup(mc))
+ return_ptr(mc);
+ return NULL;
+}
+
+static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new,
+ gfn_t gfn, union asce asce)
+{
+ return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce);
+}
+
+static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new,
+ gfn_t gfn, union asce asce)
+{
+ return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
+}
+
+static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
+{
+ union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);
+
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
+ return oldcrste;
+}
+
+static inline int get_level(union crste *crstep, union pte *ptep)
+{
+ return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt;
+}
+
+static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+ unsigned long npages)
+{
+ return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING);
+}
+
+static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+ unsigned long npages)
+{
+ return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0);
+}
+
+static inline bool crste_is_ucas(union crste crste)
+{
+ return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0;
+}
+
+#endif /* __KVM_S390_DAT_H */
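
The mmu-cache interface above is easiest to follow with a usage sketch. The following is illustrative commentary only, not part of the patch: kvm_s390_dat_example() is a made-up caller and the initial pte/pgste values passed to dat_init_page_table() are assumptions.

static int kvm_s390_dat_example(struct kvm *kvm)
{
	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
	struct page_table *pt;

	/* Fill the cache while sleeping is still allowed. */
	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	scoped_guard(read_lock, &kvm->mmu_lock) {
		/* Under mmu_lock, allocations come from the cache or GFP_ATOMIC. */
		pt = kvm_s390_mmu_cache_alloc_pt(mc);
		if (!pt)
			return -ENOMEM;
		dat_init_page_table(pt, _PAGE_INVALID, 0);
		/* A real caller would link pt into the guest DAT tables here. */
		dat_free_pt(pt);
	}
	return 0;
}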
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 74f73141f9b9..d89d1c381522 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -10,12 +10,30 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
#include "gaccess.h"
+#include "gmap.h"
+
+static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslot_iter iter;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned long start, end;
+
+ slots = kvm_vcpu_memslots(vcpu);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ slot = iter.slot;
+ start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn));
+ end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages));
+ gmap_helper_discard(vcpu->kvm->mm, start, end);
+ }
+}
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
@@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end);
+ mmap_read_lock(vcpu->kvm->mm);
/*
* We checked for start >= end above, so lets check for the
* fast path (no prefix swap page involved)
*/
if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) {
- gmap_discard(vcpu->arch.gmap, start, end);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end));
} else {
/*
* This is slow path. gmap_discard will check for start
@@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
* prefix and let gmap_discard make some of these calls
* NOPs.
*/
- gmap_discard(vcpu->arch.gmap, start, prefix);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix));
if (start <= prefix)
- gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE);
+ do_discard_gfn_range(vcpu, 0, 1);
if (end > prefix + PAGE_SIZE)
- gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE);
- gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end);
+ do_discard_gfn_range(vcpu, 1, 2);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end));
}
+ mmap_read_unlock(vcpu->kvm->mm);
return 0;
}
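
To make the slow-path splitting above easier to follow, here is a worked example with made-up values; this is commentary only, not part of the patch.

/*
 * Example: prefix = 0x8000, start = 0x6000, end = 0xb000.  The slow path
 * then issues the following discards (gfn ranges are half-open):
 *
 *	do_discard_gfn_range(vcpu, 6, 8);	// [start, prefix)
 *	do_discard_gfn_range(vcpu, 0, 1);	// real prefix page 0 maps to absolute gfn 0
 *	do_discard_gfn_range(vcpu, 1, 2);	// real prefix page 1 maps to absolute gfn 1
 *	do_discard_gfn_range(vcpu, 10, 11);	// [prefix + 2 pages, end)
 *
 * so the absolute pages at the prefix (gfns 8 and 9) are never discarded
 * directly.
 */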
diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c
new file mode 100644
index 000000000000..ddf0ca71f374
--- /dev/null
+++ b/arch/s390/kvm/faultin.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest fault handling.
+ *
+ * Copyright IBM Corp. 2025
+ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#include "gmap.h"
+#include "trace.h"
+#include "faultin.h"
+
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
+
+/**
+ * kvm_s390_faultin_gfn() - handle a dat fault.
+ * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM.
+ * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU.
+ * @f: The guest fault that needs to be resolved.
+ *
+ * Return:
+ * * 0 on success
+ * * < 0 in case of error
+ * * > 0 in case of guest exceptions
+ *
+ * Context:
+ * * The mm lock must not be held before calling
+ * * kvm->srcu must be held
+ * * may sleep
+ */
+int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f)
+{
+ struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL;
+ struct kvm_s390_mmu_cache *mc = NULL;
+ struct kvm_memory_slot *slot;
+ unsigned long inv_seq;
+ int foll, rc = 0;
+
+ foll = f->write_attempt ? FOLL_WRITE : 0;
+ foll |= f->attempt_pfault ? FOLL_NOWAIT : 0;
+
+ if (vcpu) {
+ kvm = vcpu->kvm;
+ mc = vcpu->arch.mc;
+ }
+
+ lockdep_assert_held(&kvm->srcu);
+
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0)
+ return 0;
+ }
+
+ while (1) {
+ f->valid = false;
+ inv_seq = kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
+
+ if (vcpu)
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn);
+ else
+ slot = gfn_to_memslot(kvm, f->gfn);
+ f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
+
+ /* Needs I/O, try to set up an async pfault (only possible with FOLL_NOWAIT). */
+ if (f->pfn == KVM_PFN_ERR_NEEDS_IO) {
+ if (unlikely(!f->attempt_pfault))
+ return -EAGAIN;
+ if (unlikely(!vcpu))
+ return -EINVAL;
+ trace_kvm_s390_major_guest_pfault(vcpu);
+ if (kvm_arch_setup_async_pf(vcpu))
+ return 0;
+ vcpu->stat.pfault_sync++;
+ /* Could not set up async pfault, try again synchronously. */
+ foll &= ~FOLL_NOWAIT;
+ f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
+ }
+
+ /* Access outside memory, addressing exception. */
+ if (is_noslot_pfn(f->pfn))
+ return PGM_ADDRESSING;
+ /* Signal pending: try again. */
+ if (f->pfn == KVM_PFN_ERR_SIGPENDING)
+ return -EAGAIN;
+ /* Check if it's read-only memory; don't try to actually handle that case. */
+ if (f->pfn == KVM_PFN_ERR_RO_FAULT)
+ return -EOPNOTSUPP;
+ /* Any other error. */
+ if (is_error_pfn(f->pfn))
+ return -EFAULT;
+
+ if (!mc) {
+ local_mc = kvm_s390_new_mmu_cache();
+ if (!local_mc)
+ return -ENOMEM;
+ mc = local_mc;
+ }
+
+ /* Loop, will automatically release the faulted page. */
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) {
+ kvm_release_faultin_page(kvm, f->page, true, false);
+ continue;
+ }
+
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) {
+ f->valid = true;
+ rc = gmap_link(mc, kvm->arch.gmap, f, slot);
+ kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt);
+ f->page = NULL;
+ }
+ }
+ kvm_release_faultin_page(kvm, f->page, true, false);
+
+ if (rc == -ENOMEM) {
+ rc = kvm_s390_mmu_cache_topup(mc);
+ if (rc)
+ return rc;
+ } else if (rc != -EAGAIN) {
+ return rc;
+ }
+ }
+}
+
+int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ int foll = w ? FOLL_WRITE : 0;
+
+ f->write_attempt = w;
+ f->gfn = gfn;
+ f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page);
+ if (is_noslot_pfn(f->pfn))
+ return PGM_ADDRESSING;
+ if (is_sigpending_pfn(f->pfn))
+ return -EINTR;
+ if (f->pfn == KVM_PFN_ERR_NEEDS_IO)
+ return -EAGAIN;
+ if (is_error_pfn(f->pfn))
+ return -EFAULT;
+
+ f->valid = true;
+ return 0;
+}
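
As a usage sketch only (the caller and its error policy are assumptions, not part of this patch), a vCPU write fault could be resolved with the new interface like this:

static int example_resolve_write_fault(struct kvm_vcpu *vcpu, gpa_t gaddr)
{
	struct guest_fault f = {
		.gfn = gpa_to_gfn(gaddr),
		.write_attempt = true,
	};
	int rc;

	/* Caller holds kvm->srcu and does not hold the mmap lock. */
	rc = kvm_s390_faultin_gfn(vcpu, NULL, &f);
	if (rc > 0)
		/* Positive values are program interruption codes for the guest. */
		return kvm_s390_inject_program_int(vcpu, rc);
	return rc;
}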
diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h
new file mode 100644
index 000000000000..f86176d2769c
--- /dev/null
+++ b/arch/s390/kvm/faultin.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KVM guest fault handling.
+ *
+ * Copyright IBM Corp. 2025
+ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_FAULTIN_H
+#define __KVM_S390_FAULTIN_H
+
+#include <linux/kvm_host.h>
+
+#include "dat.h"
+
+int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f);
+int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w);
+
+static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm,
+ gfn_t gfn, bool wr)
+{
+ struct guest_fault f = { .gfn = gfn, .write_attempt = wr, };
+
+ return kvm_s390_faultin_gfn(vcpu, kvm, &f);
+}
+
+static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f,
+ gpa_t gaddr, unsigned long *val)
+{
+ int rc;
+
+ rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false);
+ if (rc)
+ return rc;
+
+ *val = *(unsigned long *)phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr));
+
+ return 0;
+}
+
+static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults,
+ int n, bool ignore)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ kvm_release_faultin_page(kvm, guest_faults[i].page, ignore,
+ guest_faults[i].write_attempt);
+ guest_faults[i].page = NULL;
+ }
+}
+
+static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq,
+ struct guest_fault *guest_faults, int n,
+ bool unsafe)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (!guest_faults[i].valid)
+ continue;
+ if (unsafe && mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn))
+ return true;
+ if (!unsafe && mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn))
+ return true;
+ }
+ return false;
+}
+
+static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults,
+ gfn_t start, int n_pages, bool write_attempt)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < n_pages; i++) {
+ rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+
+#define kvm_s390_release_faultin_array(kvm, array, ignore) \
+ kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore)
+
+#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \
+ kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true)
+
+#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \
+ kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false)
+
+#endif /* __KVM_S390_FAULTIN_H */
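
The array helpers above are meant to be paired with KVM's mmu invalidation sequence counter. A hedged sketch of that pattern follows; the function name and the two-page pin are illustrative assumptions, not part of the patch.

static int example_pin_two_pages(struct kvm *kvm, gfn_t start)
{
	struct guest_fault faults[2] = {};
	unsigned long seq;
	int rc;

	seq = kvm->mmu_invalidate_seq;
	smp_rmb();	/* pairs with the smp_wmb() in kvm_mmu_invalidate_end() */

	rc = kvm_s390_get_guest_pages(kvm, faults, start, ARRAY_SIZE(faults), true);
	if (rc) {
		kvm_s390_release_faultin_array(kvm, faults, true);
		return rc;
	}

	scoped_guard(read_lock, &kvm->mmu_lock) {
		if (kvm_s390_array_needs_retry_safe(kvm, seq, faults)) {
			kvm_s390_release_faultin_array(kvm, faults, true);
			return -EAGAIN;
		}
		/* The pinned pfns in faults[] are now stable and can be used. */
		kvm_s390_release_faultin_array(kvm, faults, false);
	}
	return 0;
}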
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index f6fded15633a..b07accd19618 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -11,40 +11,43 @@
#include <linux/err.h>
#include <linux/pgtable.h>
#include <linux/bitfield.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <asm/diag.h>
#include <asm/access-regs.h>
#include <asm/fault.h>
-#include <asm/gmap.h>
#include <asm/dat-bits.h>
#include "kvm-s390.h"
+#include "dat.h"
#include "gmap.h"
#include "gaccess.h"
+#include "faultin.h"
-/*
- * vaddress union in order to easily decode a virtual address into its
- * region first index, region second index etc. parts.
- */
-union vaddress {
- unsigned long addr;
- struct {
- unsigned long rfx : 11;
- unsigned long rsx : 11;
- unsigned long rtx : 11;
- unsigned long sx : 11;
- unsigned long px : 8;
- unsigned long bx : 12;
- };
- struct {
- unsigned long rfx01 : 2;
- unsigned long : 9;
- unsigned long rsx01 : 2;
- unsigned long : 9;
- unsigned long rtx01 : 2;
- unsigned long : 9;
- unsigned long sx01 : 2;
- unsigned long : 29;
- };
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
+union dat_table_entry {
+ unsigned long val;
+ union region1_table_entry pgd;
+ union region2_table_entry p4d;
+ union region3_table_entry pud;
+ union segment_table_entry pmd;
+ union page_table_entry pte;
};
+#define WALK_N_ENTRIES 7
+#define LEVEL_MEM -2
+struct pgtwalk {
+ struct guest_fault raw_entries[WALK_N_ENTRIES];
+ gpa_t last_addr;
+ int level;
+ bool p;
+};
+
+static inline struct guest_fault *get_entries(struct pgtwalk *w)
+{
+ return w->raw_entries - LEVEL_MEM;
+}
+
/*
* raddress union which will contain the result (real or absolute address)
* after a page table walk. The rfaa, sfaa and pfra members are used to
@@ -106,16 +109,33 @@ struct aste {
/* .. more fields there */
};
+union oac {
+ unsigned int val;
+ struct {
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac1;
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac2;
+ };
+};
+
int ipte_lock_held(struct kvm *kvm)
{
- if (sclp.has_siif) {
- int rc;
+ if (sclp.has_siif)
+ return kvm->arch.sca->ipte_control.kh != 0;
- read_lock(&kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
- read_unlock(&kvm->arch.sca_lock);
- return rc;
- }
return kvm->arch.ipte_lock_count != 0;
}
@@ -128,19 +148,16 @@ static void ipte_lock_simple(struct kvm *kvm)
if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
if (old.k) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
out:
mutex_unlock(&kvm->arch.ipte_mutex);
}
@@ -153,14 +170,12 @@ static void ipte_unlock_simple(struct kvm *kvm)
kvm->arch.ipte_lock_count--;
if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
new = old;
new.k = 0;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
wake_up(&kvm->arch.ipte_wq);
out:
mutex_unlock(&kvm->arch.ipte_mutex);
@@ -171,12 +186,10 @@ static void ipte_lock_siif(struct kvm *kvm)
union ipte_control old, new, *ic;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
if (old.kg) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
@@ -184,15 +197,13 @@ retry:
new.k = 1;
new.kh++;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
}
static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
old = READ_ONCE(*ic);
do {
new = old;
@@ -200,7 +211,6 @@ static void ipte_unlock_siif(struct kvm *kvm)
if (!new.kh)
new.k = 0;
} while (!try_cmpxchg(&ic->val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
wake_up(&kvm->arch.ipte_wq);
}
@@ -318,7 +328,7 @@ enum prot_type {
PROT_TYPE_DAT = 3,
PROT_TYPE_IEP = 4,
/* Dummy value for passing an initialized value when code != PGM_PROTECTION */
- PROT_NONE,
+ PROT_TYPE_DUMMY,
};
static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
@@ -334,7 +344,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva,
switch (code) {
case PGM_PROTECTION:
switch (prot) {
- case PROT_NONE:
+ case PROT_TYPE_DUMMY:
/* We should never get here, acts like termination */
WARN_ON_ONCE(1);
break;
@@ -437,7 +447,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
}
/**
- * guest_translate - translate a guest virtual into a guest absolute address
+ * guest_translate_gva() - translate a guest virtual into a guest absolute address
* @vcpu: virtual cpu
* @gva: guest virtual address
* @gpa: points to where guest physical (absolute) address should be stored
@@ -457,9 +467,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* the returned value is the program interruption code as defined
* by the architecture
*/
-static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
- unsigned long *gpa, const union asce asce,
- enum gacc_mode mode, enum prot_type *prot)
+static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva,
+ unsigned long *gpa, const union asce asce,
+ enum gacc_mode mode, enum prot_type *prot)
{
union vaddress vaddr = {.addr = gva};
union raddress raddr = {.addr = gva};
@@ -640,31 +650,19 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int vm_check_access_key(struct kvm *kvm, u8 access_key,
- enum gacc_mode mode, gpa_t gpa)
+static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key,
+ enum gacc_mode mode, gpa_t gpa)
{
- u8 storage_key, access_control;
- bool fetch_protected;
- unsigned long hva;
+ union skey storage_key;
int r;
- if (access_key == 0)
- return 0;
-
- hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
-
- mmap_read_lock(current->mm);
- r = get_guest_storage_key(current->mm, hva, &storage_key);
- mmap_read_unlock(current->mm);
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
if (r)
return r;
- access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
- if (access_control == access_key)
+ if (access_key == 0 || storage_key.acc == access_key)
return 0;
- fetch_protected = storage_key & _PAGE_FP_BIT;
- if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+ if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp)
return 0;
return PGM_PROTECTION;
}
@@ -703,12 +701,11 @@ static bool storage_prot_override_applies(u8 access_control)
return access_control == PAGE_SPO_ACC;
}
-static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
- enum gacc_mode mode, union asce asce, gpa_t gpa,
- unsigned long ga, unsigned int len)
+static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key,
+ enum gacc_mode mode, union asce asce, gpa_t gpa,
+ unsigned long ga, unsigned int len)
{
- u8 storage_key, access_control;
- unsigned long hva;
+ union skey storage_key;
int r;
/* access key 0 matches any storage key -> allow */
@@ -718,26 +715,23 @@ static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
* caller needs to ensure that gfn is accessible, so we can
* assume that this cannot fail
*/
- hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa));
- mmap_read_lock(current->mm);
- r = get_guest_storage_key(current->mm, hva, &storage_key);
- mmap_read_unlock(current->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
if (r)
return r;
- access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
/* access key matches storage key -> allow */
- if (access_control == access_key)
+ if (storage_key.acc == access_key)
return 0;
if (mode == GACC_FETCH || mode == GACC_IFETCH) {
/* it is a fetch and fetch protection is off -> allow */
- if (!(storage_key & _PAGE_FP_BIT))
+ if (!storage_key.fp)
return 0;
if (fetch_prot_override_applicable(vcpu, mode, asce) &&
fetch_prot_override_applies(ga, len))
return 0;
}
if (storage_prot_override_applicable(vcpu) &&
- storage_prot_override_applies(access_control))
+ storage_prot_override_applies(storage_key.acc))
return 0;
return PGM_PROTECTION;
}
@@ -797,20 +791,19 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
PROT_TYPE_LA);
if (psw_bits(*psw).dat) {
- rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot);
+ rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot);
if (rc < 0)
return rc;
} else {
gpa = kvm_s390_real_to_abs(vcpu, ga);
if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
- prot = PROT_NONE;
+ prot = PROT_TYPE_DUMMY;
}
}
if (rc)
return trans_exc(vcpu, rc, ga, ar, mode, prot);
- rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga,
- fragment_len);
+ rc = vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len);
if (rc)
return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC);
if (gpas)
@@ -822,8 +815,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
return 0;
}
-static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
- void *data, unsigned int len)
+static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len)
{
const unsigned int offset = offset_in_page(gpa);
const gfn_t gfn = gpa_to_gfn(gpa);
@@ -838,38 +831,79 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
return rc;
}
-static int
-access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
- void *data, unsigned int len, u8 access_key)
+static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key)
{
- struct kvm_memory_slot *slot;
- bool writable;
- gfn_t gfn;
- hva_t hva;
- int rc;
+ union oac spec = {
+ .oac1.key = dst_key,
+ .oac1.k = !!dst_key,
+ .oac2.key = src_key,
+ .oac2.k = !!src_key,
+ };
+ int exception = PGM_PROTECTION;
+
+ asm_inline volatile(
+ " lr %%r0,%[spec]\n"
+ "0: mvcos %[to],%[from],%[size]\n"
+ "1: lhi %[exc],0\n"
+ "2:\n"
+ EX_TABLE(0b, 2b)
+ EX_TABLE(1b, 2b)
+ : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception)
+ : [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
+ : "memory", "cc", "0");
+ return exception;
+}
- gfn = gpa >> PAGE_SHIFT;
- slot = gfn_to_memslot(kvm, gfn);
- hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+struct acc_page_key_context {
+ void *data;
+ int exception;
+ unsigned short offset;
+ unsigned short len;
+ bool store;
+ u8 access_key;
+};
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
- /*
- * Check if it's a ro memslot, even tho that can't occur (they're unsupported).
- * Don't try to actually handle that case.
- */
- if (!writable && mode == GACC_STORE)
- return -EOPNOTSUPP;
- hva += offset_in_page(gpa);
- if (mode == GACC_STORE)
- rc = copy_to_user_key((void __user *)hva, data, len, access_key);
+static void _access_guest_page_with_key_gpa(struct guest_fault *f)
+{
+ struct acc_page_key_context *context = f->priv;
+ void *ptr;
+ int r;
+
+ ptr = __va(PFN_PHYS(f->pfn) | context->offset);
+
+ if (context->store)
+ r = mvcos_key(ptr, context->data, context->len, context->access_key, 0);
else
- rc = copy_from_user_key(data, (void __user *)hva, len, access_key);
+ r = mvcos_key(context->data, ptr, context->len, 0, context->access_key);
+
+ context->exception = r;
+}
+
+static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+ void *data, unsigned int len, u8 acc)
+{
+ struct acc_page_key_context context = {
+ .offset = offset_in_page(gpa),
+ .len = len,
+ .data = data,
+ .access_key = acc,
+ .store = mode == GACC_STORE,
+ };
+ struct guest_fault fault = {
+ .gfn = gpa_to_gfn(gpa),
+ .priv = &context,
+ .write_attempt = mode == GACC_STORE,
+ .callback = _access_guest_page_with_key_gpa,
+ };
+ int rc;
+
+ if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm))
+ return -EINVAL;
+
+ rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
if (rc)
- return PGM_PROTECTION;
- if (mode == GACC_STORE)
- mark_page_dirty_in_slot(kvm, slot, gfn);
- return 0;
+ return rc;
+ return context.exception;
}
int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
@@ -881,7 +915,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
while (min(PAGE_SIZE - offset, len) > 0) {
fragment_len = min(PAGE_SIZE - offset, len);
- rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key);
+ rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key);
if (rc)
return rc;
offset = 0;
@@ -941,15 +975,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
for (idx = 0; idx < nr_pages; idx++) {
fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len);
if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) {
- rc = access_guest_page(vcpu->kvm, mode, gpas[idx],
- data, fragment_len);
+ rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len);
} else {
- rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
- data, fragment_len, access_key);
+ rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, access_key);
}
if (rc == PGM_PROTECTION && try_storage_prot_override)
- rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
- data, fragment_len, PAGE_SPO_ACC);
+ rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx],
+ data, fragment_len, PAGE_SPO_ACC);
if (rc)
break;
len -= fragment_len;
@@ -962,7 +995,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
if (rc == PGM_PROTECTION)
prot = PROT_TYPE_KEYC;
else
- prot = PROT_NONE;
+ prot = PROT_TYPE_DUMMY;
rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
}
out_unlock:
@@ -983,7 +1016,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
while (len && !rc) {
gpa = kvm_s390_real_to_abs(vcpu, gra);
fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len);
- rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len);
+ rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len);
len -= fragment_len;
gra += fragment_len;
data += fragment_len;
@@ -994,17 +1027,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
}
/**
+ * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys.
+ * @ptr: Address of value to compare to *@old and exchange with
+ * @new. Must be aligned to @size.
+ * @old: Old value. Compared to the content pointed to by @ptr in order to
+ * determine if the exchange occurs. The old value read from *@ptr is
+ * written here.
+ * @new: New value to place at *@ptr.
+ * @size: Size of the operation in bytes, may only be a power of two up to 16.
+ * @access_key: Access key to use for checking storage key protection.
+ *
+ * Perform a cmpxchg on guest memory, honoring storage key protection.
+ * @access_key alone determines how key checking is performed, neither
+ * storage-protection-override nor fetch-protection-override apply.
+ * In case of an exception *@old is set to zero.
+ *
+ * Return:
+ * * %0: cmpxchg executed successfully
+ * * %1: cmpxchg executed unsuccessfully
+ * * %PGM_PROTECTION: an exception happened when trying to access *@ptr
+ * * %-EAGAIN: maxed out number of retries (byte and short only)
+ * * %-EINVAL: invalid value for @size
+ */
+static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old,
+ union kvm_s390_quad new, int size, u8 access_key)
+{
+ union kvm_s390_quad tmp = { .sixteen = 0 };
+ int rc;
+
+ /*
+ * The cmpxchg_key macro depends on the type of "old", so we need
+ * a case for each valid length and get some code duplication as long
+ * as we don't introduce a new macro.
+ */
+ switch (size) {
+ case 1:
+ rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key);
+ break;
+ case 2:
+ rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key);
+ break;
+ case 4:
+ rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key);
+ break;
+ case 8:
+ rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key);
+ break;
+ case 16:
+ rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen,
+ access_key);
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (!rc && memcmp(&tmp, old, size))
+ rc = 1;
+ *old = tmp;
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ */
+ if (rc == -EFAULT)
+ rc = PGM_PROTECTION;
+ return rc;
+}
+
+struct cmpxchg_key_context {
+ union kvm_s390_quad new;
+ union kvm_s390_quad *old;
+ int exception;
+ unsigned short offset;
+ u8 access_key;
+ u8 len;
+};
+
+static void _cmpxchg_guest_abs_with_key(struct guest_fault *f)
+{
+ struct cmpxchg_key_context *context = f->priv;
+
+ context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset),
+ context->old, context->new, context->len,
+ context->access_key);
+}
+
+/**
* cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
* @kvm: Virtual machine instance.
* @gpa: Absolute guest address of the location to be changed.
* @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
* non power of two will result in failure.
- * @old_addr: Pointer to old value. If the location at @gpa contains this value,
- * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
- * *@old_addr contains the value at @gpa before the attempt to
- * exchange the value.
+ * @old: Pointer to old value. If the location at @gpa contains this value,
+ * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ * *@old contains the value at @gpa before the attempt to
+ * exchange the value.
* @new: The value to place at @gpa.
- * @access_key: The access key to use for the guest access.
+ * @acc: The access key to use for the guest access.
* @success: output value indicating if an exchange occurred.
*
* Atomically exchange the value at @gpa by @new, if it contains *@old.
@@ -1017,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* * -EAGAIN: transient failure (len 1 or 2)
* * -EOPNOTSUPP: read-only memslot (should never occur)
*/
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
- __uint128_t *old_addr, __uint128_t new,
- u8 access_key, bool *success)
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
+ union kvm_s390_quad new, u8 acc, bool *success)
{
- gfn_t gfn = gpa_to_gfn(gpa);
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
- bool writable;
- hva_t hva;
- int ret;
-
- if (!IS_ALIGNED(gpa, len))
- return -EINVAL;
-
- hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
- /*
- * Check if it's a read-only memslot, even though that cannot occur
- * since those are unsupported.
- * Don't try to actually handle that case.
- */
- if (!writable)
- return -EOPNOTSUPP;
-
- hva += offset_in_page(gpa);
- /*
- * The cmpxchg_user_key macro depends on the type of "old", so we need
- * a case for each valid length and get some code duplication as long
- * as we don't introduce a new macro.
- */
- switch (len) {
- case 1: {
- u8 old;
-
- ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key);
- *success = !ret && old == *old_addr;
- *old_addr = old;
- break;
- }
- case 2: {
- u16 old;
-
- ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key);
- *success = !ret && old == *old_addr;
- *old_addr = old;
- break;
- }
- case 4: {
- u32 old;
-
- ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key);
- *success = !ret && old == *old_addr;
- *old_addr = old;
- break;
- }
- case 8: {
- u64 old;
+ struct cmpxchg_key_context context = {
+ .old = old,
+ .new = new,
+ .offset = offset_in_page(gpa),
+ .len = len,
+ .access_key = acc,
+ };
+ struct guest_fault fault = {
+ .gfn = gpa_to_gfn(gpa),
+ .priv = &context,
+ .write_attempt = true,
+ .callback = _cmpxchg_guest_abs_with_key,
+ };
+ int rc;
- ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key);
- *success = !ret && old == *old_addr;
- *old_addr = old;
- break;
- }
- case 16: {
- __uint128_t old;
+ lockdep_assert_held(&kvm->srcu);
- ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key);
- *success = !ret && old == *old_addr;
- *old_addr = old;
- break;
- }
- default:
+ if (len > 16 || !IS_ALIGNED(gpa, len))
return -EINVAL;
- }
- if (*success)
- mark_page_dirty_in_slot(kvm, slot, gfn);
- /*
- * Assume that the fault is caused by protection, either key protection
- * or user page write protection.
- */
- if (ret == -EFAULT)
- ret = PGM_PROTECTION;
- return ret;
+
+ rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
+ if (rc)
+ return rc;
+ *success = !context.exception;
+ if (context.exception == 1)
+ return 0;
+ return context.exception;
}
/**
@@ -1174,7 +1238,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
while (length && !rc) {
fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length);
- rc = vm_check_access_key(kvm, access_key, mode, gpa);
+ rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa);
length -= fragment_len;
gpa += fragment_len;
}
@@ -1201,304 +1265,409 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
}
/**
- * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the beginning of the page table for the given address if
- * successful (return value 0), or to the first invalid DAT entry in
- * case of exceptions (return value > 0)
- * @dat_protection: referenced memory is write protected
- * @fake: pgt references contiguous guest memory block, not a pgtable
+ * walk_guest_tables() - Walk the guest page table and pin the dat tables.
+ * @sg: Pointer to the shadow guest address space structure.
+ * @saddr: Faulting address in the shadow gmap.
+ * @w: Will be filled with information on the pinned pages.
+ * @wr: Wndicates a write access if true.
+ *
+ * Return:
+ * * %0 in case of success,
+ * * a PIC code > 0 in case the address translation fails
+ * * an error code < 0 if other errors happen in the host
*/
-static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
- unsigned long *pgt, int *dat_protection,
- int *fake)
+static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr)
{
- struct kvm *kvm;
- struct gmap *parent;
- union asce asce;
+ struct gmap *parent = sg->parent;
+ struct guest_fault *entries;
+ union dat_table_entry table;
union vaddress vaddr;
unsigned long ptr;
+ struct kvm *kvm;
+ union asce asce;
int rc;
- *fake = 0;
- *dat_protection = 0;
- kvm = sg->private;
- parent = sg->parent;
+ if (!parent)
+ return -EAGAIN;
+ kvm = parent->kvm;
+ WARN_ON(!kvm);
+ asce = sg->guest_asce;
+ entries = get_entries(w);
+
+ w->level = LEVEL_MEM;
+ w->last_addr = saddr;
+ if (asce.r)
+ return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false);
+
vaddr.addr = saddr;
- asce.val = sg->orig_asce;
ptr = asce.rsto * PAGE_SIZE;
- if (asce.r) {
- *fake = 1;
- ptr = 0;
- asce.dt = ASCE_TYPE_REGION1;
- }
+
+ if (!asce_contains_gfn(asce, gpa_to_gfn(saddr)))
+ return PGM_ASCE_TYPE;
switch (asce.dt) {
case ASCE_TYPE_REGION1:
- if (vaddr.rfx01 > asce.tl && !*fake)
+ if (vaddr.rfx01 > asce.tl)
return PGM_REGION_FIRST_TRANS;
break;
case ASCE_TYPE_REGION2:
- if (vaddr.rfx)
- return PGM_ASCE_TYPE;
if (vaddr.rsx01 > asce.tl)
return PGM_REGION_SECOND_TRANS;
break;
case ASCE_TYPE_REGION3:
- if (vaddr.rfx || vaddr.rsx)
- return PGM_ASCE_TYPE;
if (vaddr.rtx01 > asce.tl)
return PGM_REGION_THIRD_TRANS;
break;
case ASCE_TYPE_SEGMENT:
- if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
- return PGM_ASCE_TYPE;
if (vaddr.sx01 > asce.tl)
return PGM_SEGMENT_TRANSLATION;
break;
}
+ w->level = asce.dt;
switch (asce.dt) {
- case ASCE_TYPE_REGION1: {
- union region1_table_entry rfte;
-
- if (*fake) {
- ptr += vaddr.rfx * _REGION1_SIZE;
- rfte.val = ptr;
- goto shadow_r2t;
- }
- *pgt = ptr + vaddr.rfx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ case ASCE_TYPE_REGION1:
+ w->last_addr = ptr + vaddr.rfx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rfte.i)
+ if (table.pgd.i)
return PGM_REGION_FIRST_TRANS;
- if (rfte.tt != TABLE_TYPE_REGION1)
+ if (table.pgd.tt != TABLE_TYPE_REGION1)
return PGM_TRANSLATION_SPEC;
- if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl)
return PGM_REGION_SECOND_TRANS;
if (sg->edat_level >= 1)
- *dat_protection |= rfte.p;
- ptr = rfte.rto * PAGE_SIZE;
-shadow_r2t:
- rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r1_entry++;
- }
+ w->p |= table.pgd.p;
+ ptr = table.pgd.rto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_REGION2: {
- union region2_table_entry rste;
-
- if (*fake) {
- ptr += vaddr.rsx * _REGION2_SIZE;
- rste.val = ptr;
- goto shadow_r3t;
- }
- *pgt = ptr + vaddr.rsx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ case ASCE_TYPE_REGION2:
+ w->last_addr = ptr + vaddr.rsx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rste.i)
+ if (table.p4d.i)
return PGM_REGION_SECOND_TRANS;
- if (rste.tt != TABLE_TYPE_REGION2)
+ if (table.p4d.tt != TABLE_TYPE_REGION2)
return PGM_TRANSLATION_SPEC;
- if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl)
return PGM_REGION_THIRD_TRANS;
if (sg->edat_level >= 1)
- *dat_protection |= rste.p;
- ptr = rste.rto * PAGE_SIZE;
-shadow_r3t:
- rste.p |= *dat_protection;
- rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r2_entry++;
- }
+ w->p |= table.p4d.p;
+ ptr = table.p4d.rto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_REGION3: {
- union region3_table_entry rtte;
-
- if (*fake) {
- ptr += vaddr.rtx * _REGION3_SIZE;
- rtte.val = ptr;
- goto shadow_sgt;
- }
- *pgt = ptr + vaddr.rtx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ case ASCE_TYPE_REGION3:
+ w->last_addr = ptr + vaddr.rtx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rtte.i)
+ if (table.pud.i)
return PGM_REGION_THIRD_TRANS;
- if (rtte.tt != TABLE_TYPE_REGION3)
+ if (table.pud.tt != TABLE_TYPE_REGION3)
return PGM_TRANSLATION_SPEC;
- if (rtte.cr && asce.p && sg->edat_level >= 2)
+ if (table.pud.cr && asce.p && sg->edat_level >= 2)
return PGM_TRANSLATION_SPEC;
- if (rtte.fc && sg->edat_level >= 2) {
- *dat_protection |= rtte.fc0.p;
- *fake = 1;
- ptr = rtte.fc1.rfaa * _REGION3_SIZE;
- rtte.val = ptr;
- goto shadow_sgt;
+ if (sg->edat_level >= 1)
+ w->p |= table.pud.p;
+ if (table.pud.fc && sg->edat_level >= 2) {
+ table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK);
+ goto edat_applies;
}
- if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl)
return PGM_SEGMENT_TRANSLATION;
- if (sg->edat_level >= 1)
- *dat_protection |= rtte.fc0.p;
- ptr = rtte.fc0.sto * PAGE_SIZE;
-shadow_sgt:
- rtte.fc0.p |= *dat_protection;
- rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r3_entry++;
- }
+ ptr = table.pud.fc0.sto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_SEGMENT: {
- union segment_table_entry ste;
-
- if (*fake) {
- ptr += vaddr.sx * _SEGMENT_SIZE;
- ste.val = ptr;
- goto shadow_pgt;
- }
- *pgt = ptr + vaddr.sx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ case ASCE_TYPE_SEGMENT:
+ w->last_addr = ptr + vaddr.sx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (ste.i)
+ if (table.pmd.i)
return PGM_SEGMENT_TRANSLATION;
- if (ste.tt != TABLE_TYPE_SEGMENT)
+ if (table.pmd.tt != TABLE_TYPE_SEGMENT)
return PGM_TRANSLATION_SPEC;
- if (ste.cs && asce.p)
+ if (table.pmd.cs && asce.p)
return PGM_TRANSLATION_SPEC;
- *dat_protection |= ste.fc0.p;
- if (ste.fc && sg->edat_level >= 1) {
- *fake = 1;
- ptr = ste.fc1.sfaa * _SEGMENT_SIZE;
- ste.val = ptr;
- goto shadow_pgt;
+ w->p |= table.pmd.p;
+ if (table.pmd.fc && sg->edat_level >= 1) {
+ table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK);
+ goto edat_applies;
}
- ptr = ste.fc0.pto * (PAGE_SIZE / 2);
-shadow_pgt:
- ste.fc0.p |= *dat_protection;
- rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2);
+ w->level--;
+ }
+ w->last_addr = ptr + vaddr.px * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
+ if (rc)
+ return rc;
+ if (table.pte.i)
+ return PGM_PAGE_TRANSLATION;
+ if (table.pte.z)
+ return PGM_TRANSLATION_SPEC;
+ w->p |= table.pte.p;
+edat_applies:
+ if (wr && w->p)
+ return PGM_PROTECTION;
+
+ return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr);
+}
+
+static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep,
+ struct guest_fault *f, bool p)
+{
+ union pgste pgste;
+ union pte newpte;
+ int rc;
+
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE);
+ if (rc)
+ return rc;
+
+ if (!pgste_get_trylock(ptep_h, &pgste))
+ return -EAGAIN;
+ newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
+ newpte.s.d |= ptep_h->s.d;
+ newpte.s.sd |= ptep_h->s.sd;
+ newpte.h.p &= ptep_h->h.p;
+ if (!newpte.h.p && !f->writable) {
+ rc = -EOPNOTSUPP;
+ } else {
+ pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
+ pgste.vsie_notif = 1;
+ }
+ pgste_set_unlock(ptep_h, pgste);
+ if (rc)
+ return rc;
+ if (sg->invalidated)
+ return -EAGAIN;
+
+ newpte = _pte(f->pfn, 0, !p, 0);
+ if (!pgste_get_trylock(ptep, &pgste))
+ return -EAGAIN;
+ pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg));
+ pgste_set_unlock(ptep, pgste);
+
+ return 0;
+}
+
+static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
+ struct guest_fault *f, bool p)
+{
+ union crste newcrste, oldcrste;
+ gfn_t gfn;
+ int rc;
+
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK);
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt);
+ if (rc)
+ return rc;
+
+ do {
+ /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
+ if (sg->invalidated)
+ return -EAGAIN;
+ oldcrste = READ_ONCE(*host);
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
+ newcrste.s.fc1.d |= oldcrste.s.fc1.d;
+ newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
+ newcrste.h.p &= oldcrste.h.p;
+ newcrste.s.fc1.vsie_notif = 1;
+ newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+ newcrste.s.fc1.s = oldcrste.s.fc1.s;
+ if (!newcrste.h.p && !f->writable)
+ return -EOPNOTSUPP;
+ } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
+ if (sg->invalidated)
+ return -EAGAIN;
+
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
+ gfn = gpa_to_gfn(raddr);
+ while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+ ;
+ return 0;
+}
+
+static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+ unsigned long saddr, struct pgtwalk *w)
+{
+ struct guest_fault *entries;
+ int flags, i, hl, gl, l, rc;
+ union crste *table, *host;
+ union pte *ptep, *ptep_h;
+
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ entries = get_entries(w);
+ ptep_h = NULL;
+ ptep = NULL;
+
+ rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE,
+ &table, &ptep);
+ if (rc)
+ return rc;
+
+ /* A race occurred. The shadow mapping is already valid, nothing to do */
+ if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
+ (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
+ return 0;
+
+ gl = get_level(table, ptep);
+
+ /* In case of a real address space */
+ if (w->level <= LEVEL_MEM) {
+ l = TABLE_TYPE_PAGE_TABLE;
+ hl = TABLE_TYPE_REGION1;
+ goto real_address_space;
+ }
+
+ /*
+ * Skip levels that are already protected. For each level, protect
+ * only the page containing the entry, not the whole table.
+ */
+ for (i = gl; i >= w->level; i--) {
+ rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
+ entries[i].pfn, i + 1, entries[i].writable);
if (rc)
return rc;
- kvm->stat.gmap_shadow_sg_entry++;
+ if (sg->invalidated)
+ return -EAGAIN;
}
+
+ rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
+ TABLE_TYPE_PAGE_TABLE, &host, &ptep_h);
+ if (rc)
+ return rc;
+
+ hl = get_level(host, ptep_h);
+ /* Get the smallest granularity */
+ l = min3(gl, hl, w->level);
+
+real_address_space:
+ flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
+ /* If necessary, create the shadow mapping */
+ if (l < gl) {
+ rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep);
+ if (rc)
+ return rc;
}
- /* Return the parent address of the page table */
- *pgt = ptr;
- return 0;
+ if (l < hl) {
+ rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce,
+ flags, l, &host, &ptep_h);
+ if (rc)
+ return rc;
+ }
+
+ if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm))
+ return -EFAULT;
+ if (l == TABLE_TYPE_PAGE_TABLE)
+ return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p);
+ return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p);
}
-/**
- * shadow_pgt_lookup() - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt,
- int *dat_protection, int *fake)
+static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ unsigned long seq, struct pgtwalk *walk)
{
- unsigned long pt_index;
- unsigned long *table;
- struct page *page;
+ struct gmap *parent;
int rc;
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page));
- *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
+ if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries))
+ return -EAGAIN;
+again:
+ rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+ if (rc)
+ return rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries))
+ return -EAGAIN;
+ parent = READ_ONCE(sg->parent);
+ if (!parent)
+ return -EAGAIN;
+ scoped_guard(spinlock, &parent->children_lock) {
+ if (READ_ONCE(sg->parent) != parent)
+ return -EAGAIN;
+ sg->invalidated = false;
+ rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk);
+ }
+ if (rc == -ENOMEM)
+ goto again;
+ if (!rc)
+ kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false);
}
- spin_unlock(&sg->guest_table_lock);
return rc;
}
/**
- * kvm_s390_shadow_fault - handle fault on a shadow page table
- * @vcpu: virtual cpu
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @datptr: will contain the address of the faulting DAT table entry, or of
- * the valid leaf, plus some flags
+ * __gaccess_shadow_fault() - Handle fault on a shadow page table.
+ * @vcpu: Virtual cpu that triggered the action.
+ * @sg: The shadow guest address space structure.
+ * @saddr: Faulting address in the shadow gmap.
+ * @datptr: Will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags.
+ * @wr: Whether this is a write access.
*
- * Returns: - 0 if the shadow fault was successfully resolved
- * - > 0 (pgm exception code) on exceptions while faulting
- * - -EAGAIN if the caller can retry immediately
- * - -EFAULT when accessing invalid guest addresses
- * - -ENOMEM if out of memory
+ * Return:
+ * * %0 if the shadow fault was successfully resolved
+ * * > 0 (pgm exception code) on exceptions while faulting
+ * * %-EAGAIN if the caller can retry immediately
+ * * %-EFAULT when accessing invalid guest addresses
+ * * %-ENOMEM if out of memory
*/
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr, unsigned long *datptr)
+static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr)
{
- union vaddress vaddr;
- union page_table_entry pte;
- unsigned long pgt = 0;
- int dat_protection, fake;
+ struct pgtwalk walk = { .p = false, };
+ unsigned long seq;
int rc;
- if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm))
- return -EFAULT;
-
- mmap_read_lock(sg->mm);
- /*
- * We don't want any guest-2 tables to change - so the parent
- * tables/pointers we read stay valid - unshadowing is however
- * always possible - only guest_table_lock protects us.
- */
- ipte_lock(vcpu->kvm);
+ seq = vcpu->kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
- rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ rc = walk_guest_tables(sg, saddr, &walk, wr);
+ if (datptr) {
+ datptr->val = walk.last_addr;
+ datptr->dat_prot = wr && walk.p;
+ datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE;
+ datptr->real = sg->guest_asce.r;
+ }
+ if (!rc)
+ rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk);
if (rc)
- rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
- &fake);
+ kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true);
+ return rc;
+}
- vaddr.addr = saddr;
- if (fake) {
- pte.val = pgt + vaddr.px * PAGE_SIZE;
- goto shadow_page;
- }
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr)
+{
+ int rc;
- switch (rc) {
- case PGM_SEGMENT_TRANSLATION:
- case PGM_REGION_THIRD_TRANS:
- case PGM_REGION_SECOND_TRANS:
- case PGM_REGION_FIRST_TRANS:
- pgt |= PEI_NOT_PTE;
- break;
- case 0:
- pgt += vaddr.px * 8;
- rc = gmap_read_table(sg->parent, pgt, &pte.val);
- }
- if (datptr)
- *datptr = pgt | dat_protection * PEI_DAT_PROT;
- if (!rc && pte.i)
- rc = PGM_PAGE_TRANSLATION;
- if (!rc && pte.z)
- rc = PGM_TRANSLATION_SPEC;
-shadow_page:
- pte.p |= dat_protection;
- if (!rc)
- rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
- vcpu->kvm->stat.gmap_shadow_pg_entry++;
+ if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm))
+ return -EFAULT;
+
+ rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+ if (rc)
+ return rc;
+
+ ipte_lock(vcpu->kvm);
+ rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r);
ipte_unlock(vcpu->kvm);
- mmap_read_unlock(sg->mm);
+
return rc;
}
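
For reference, a minimal caller sketch (not part of this patch; the helper name is made up, kvm_s390_inject_program_int() is the existing injection helper): gaccess_shadow_fault() returns -EAGAIN when the host mappings changed underneath it, so callers are expected to retry, and positive return values are program exception codes.

static int example_resolve_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
					gpa_t saddr, bool wr)
{
	union mvpg_pei pei;
	int rc;

	/* Retry while the host mappings are being invalidated concurrently. */
	do {
		rc = gaccess_shadow_fault(vcpu, sg, saddr, &pei, wr);
	} while (rc == -EAGAIN);

	/* A positive return value is a program exception code to inject. */
	if (rc > 0)
		rc = kvm_s390_inject_program_int(vcpu, rc);
	return rc;
}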
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 3fde45a151f2..b5385cec60f4 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
- __uint128_t new, u8 access_key, bool *success);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
+ union kvm_s390_quad new, u8 access_key, bool *success);
/**
* write_guest_with_key - copy data from kernel space to guest space
@@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm);
int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
-/* MVPG PEI indication bits */
-#define PEI_DAT_PROT 2
-#define PEI_NOT_PTE 4
+union mvpg_pei {
+ unsigned long val;
+ struct {
+ unsigned long addr : 61;
+ unsigned long not_pte : 1;
+ unsigned long dat_prot: 1;
+ unsigned long real : 1;
+ };
+};
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr, unsigned long *datptr);
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c
deleted file mode 100644
index a6d1dbb04c97..000000000000
--- a/arch/s390/kvm/gmap-vsie.c
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Guest memory management for KVM/s390 nested VMs.
- *
- * Copyright IBM Corp. 2008, 2020, 2024
- *
- * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
- * Martin Schwidefsky <schwidefsky@de.ibm.com>
- * David Hildenbrand <david@redhat.com>
- * Janosch Frank <frankja@linux.vnet.ibm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/pgtable.h>
-#include <linux/pagemap.h>
-#include <linux/mman.h>
-
-#include <asm/lowcore.h>
-#include <asm/gmap.h>
-#include <asm/uv.h>
-
-#include "kvm-s390.h"
-#include "gmap.h"
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- *
- * Context: Called with parent->shadow_lock held
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
- struct gmap *sg;
-
- lockdep_assert_held(&parent->shadow_lock);
- list_for_each_entry(sg, &parent->children, list) {
- if (!gmap_shadow_valid(sg, asce, edat_level))
- continue;
- if (!sg->initialized)
- return ERR_PTR(-EAGAIN);
- refcount_inc(&sg->ref_count);
- return sg;
- }
- return NULL;
-}
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
- struct gmap *sg, *new;
- unsigned long limit;
- int rc;
-
- if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) ||
- KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private))
- return ERR_PTR(-EFAULT);
- spin_lock(&parent->shadow_lock);
- sg = gmap_find_shadow(parent, asce, edat_level);
- spin_unlock(&parent->shadow_lock);
- if (sg)
- return sg;
- /* Create a new shadow gmap */
- limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
- if (asce & _ASCE_REAL_SPACE)
- limit = -1UL;
- new = gmap_alloc(limit);
- if (!new)
- return ERR_PTR(-ENOMEM);
- new->mm = parent->mm;
- new->parent = gmap_get(parent);
- new->private = parent->private;
- new->orig_asce = asce;
- new->edat_level = edat_level;
- new->initialized = false;
- spin_lock(&parent->shadow_lock);
- /* Recheck if another CPU created the same shadow */
- sg = gmap_find_shadow(parent, asce, edat_level);
- if (sg) {
- spin_unlock(&parent->shadow_lock);
- gmap_free(new);
- return sg;
- }
- if (asce & _ASCE_REAL_SPACE) {
- /* only allow one real-space gmap shadow */
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce & _ASCE_REAL_SPACE) {
- spin_lock(&sg->guest_table_lock);
- gmap_unshadow(sg);
- spin_unlock(&sg->guest_table_lock);
- list_del(&sg->list);
- gmap_put(sg);
- break;
- }
- }
- }
- refcount_set(&new->ref_count, 2);
- list_add(&new->list, &parent->children);
- if (asce & _ASCE_REAL_SPACE) {
- /* nothing to protect, return right away */
- new->initialized = true;
- spin_unlock(&parent->shadow_lock);
- return new;
- }
- spin_unlock(&parent->shadow_lock);
- /* protect after insertion, so it will get properly invalidated */
- mmap_read_lock(parent->mm);
- rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN,
- ((asce & _ASCE_TABLE_LENGTH) + 1),
- PROT_READ, GMAP_NOTIFY_SHADOW);
- mmap_read_unlock(parent->mm);
- spin_lock(&parent->shadow_lock);
- new->initialized = true;
- if (rc) {
- list_del(&new->list);
- gmap_free(new);
- new = ERR_PTR(rc);
- }
- spin_unlock(&parent->shadow_lock);
- return new;
-}
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 6d8944d1b4a0..3c26e35af0ef 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -7,7 +7,7 @@
* Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
- * Janosch Frank <frankja@linux.vnet.ibm.com>
+ * Janosch Frank <frankja@linux.ibm.com>
*/
#include <linux/compiler.h>
@@ -15,107 +15,1321 @@
#include <linux/kvm_host.h>
#include <linux/pgtable.h>
#include <linux/pagemap.h>
-
#include <asm/lowcore.h>
-#include <asm/gmap.h>
#include <asm/uv.h>
+#include <asm/gmap_helpers.h>
+#include "dat.h"
#include "gmap.h"
+#include "kvm-s390.h"
+#include "faultin.h"
+
+static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
+}
+
+static int gmap_limit_to_type(gfn_t limit)
+{
+ if (!limit)
+ return TABLE_TYPE_REGION1;
+ if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
+ return TABLE_TYPE_SEGMENT;
+ if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
+ return TABLE_TYPE_REGION3;
+ if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
+ return TABLE_TYPE_REGION2;
+ return TABLE_TYPE_REGION1;
+}
+
+/**
+ * gmap_new() - Allocate and initialize a guest address space.
+ * @kvm: The kvm owning the guest.
+ * @limit: Maximum address of the gmap address space.
+ *
+ * Return: A guest address space structure.
+ */
+struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
+{
+ struct crst_table *table;
+ struct gmap *gmap;
+ int type;
+
+ type = gmap_limit_to_type(limit);
+
+ gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
+ if (!gmap)
+ return NULL;
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->list);
+ INIT_LIST_HEAD(&gmap->scb_users);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
+ spin_lock_init(&gmap->children_lock);
+ spin_lock_init(&gmap->host_to_rmap_lock);
+ refcount_set(&gmap->refcount, 1);
+
+ table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
+ if (!table) {
+ kfree(gmap);
+ return NULL;
+ }
+
+ gmap->asce.val = __pa(table);
+ gmap->asce.dt = type;
+ gmap->asce.tl = _ASCE_TABLE_LENGTH;
+ gmap->asce.x = 1;
+ gmap->asce.p = 1;
+ gmap->asce.s = 1;
+ gmap->kvm = kvm;
+ set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
+
+ return gmap;
+}
+
+static void gmap_add_child(struct gmap *parent, struct gmap *child)
+{
+ KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
+ KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
+ KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
+ lockdep_assert_held(&parent->children_lock);
+
+ child->parent = parent;
+
+ if (is_ucontrol(parent))
+ set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+ else
+ clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+
+ if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
+ set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+ else
+ clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+
+ if (kvm_is_ucontrol(parent->kvm))
+ clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
+ list_add(&child->list, &parent->children);
+}
+
+struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
+{
+ struct gmap *res;
+
+ lockdep_assert_not_held(&parent->children_lock);
+ res = gmap_new(parent->kvm, limit);
+ if (res) {
+ scoped_guard(spinlock, &parent->children_lock)
+ gmap_add_child(parent, res);
+ }
+ return res;
+}
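
A sketch of the intended lifecycle of a child gmap (hypothetical caller, not part of this patch): gmap_new() hands out the initial reference, the child has to be unlinked under the parent's children_lock, and gmap_put() disposes of it once the last reference is dropped.

static int example_child_gmap_lifecycle(struct gmap *parent)
{
	struct gmap *child;

	/* limit 0 selects the largest possible address space, see gmap_limit_to_type() */
	child = gmap_new_child(parent, 0);
	if (!child)
		return -ENOMEM;

	/* ... use the child gmap ... */

	scoped_guard(spinlock, &parent->children_lock)
		gmap_remove_child(child);
	/* Drop the initial reference; this disposes of the gmap. */
	gmap_put(child);
	return 0;
}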
+
+int gmap_set_limit(struct gmap *gmap, gfn_t limit)
+{
+ struct kvm_s390_mmu_cache *mc;
+ int rc, type;
+
+ type = gmap_limit_to_type(limit);
+
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc)
+ return -ENOMEM;
+
+	do {
+		rc = kvm_s390_mmu_cache_topup(mc);
+		if (rc)
+			break;
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			rc = dat_set_asce_limit(mc, &gmap->asce, type);
+	} while (rc == -ENOMEM);
+
+	kvm_s390_free_mmu_cache(mc);
+	return rc;
+}
+
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct vsie_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void __rcu **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
+void gmap_remove_child(struct gmap *child)
+{
+ if (KVM_BUG_ON(!child->parent, child->kvm))
+ return;
+ lockdep_assert_held(&child->parent->children_lock);
+
+ list_del(&child->list);
+ child->parent = NULL;
+ child->invalidated = true;
+}
+
+/**
+ * gmap_dispose() - Remove and free a guest address space and its children.
+ * @gmap: Pointer to the guest address space structure.
+ */
+void gmap_dispose(struct gmap *gmap)
+{
+	/* The gmap must have been removed from the parent beforehand */
+ KVM_BUG_ON(gmap->parent, gmap->kvm);
+	/* All children of this gmap must have been removed beforehand */
+ KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
+ /* No VSIE shadow block is allowed to use this gmap */
+ KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
+ /* The ASCE must be valid */
+ KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
+ /* The refcount must be 0 */
+ KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
+
+ /* Flush tlb of all gmaps */
+ asce_flush_tlb(gmap->asce);
+
+ /* Free all DAT tables. */
+ dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
+
+ /* Free additional data for a shadow gmap */
+ if (is_shadow(gmap))
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+
+ kfree(gmap);
+}
+
+/**
+ * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
+ * @gmap: The gmap whose ASCE needs to be replaced.
+ *
+ * If the ASCE is a SEGMENT type, this function returns -EINVAL; replacing
+ * it would leave the pointers in the host_to_guest radix tree pointing to
+ * the wrong pages, causing use-after-free and memory corruption.
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * On success, the old top level table is no longer referenced by the gmap
+ * and will not be freed with it. Therefore the caller has to make sure to
+ * save a pointer to it beforehand, unless a leak is actually intended.
+ *
+ * Return: 0 in case of success, -EINVAL if the ASCE is a segment type ASCE,
+ * -ENOMEM if running out of memory.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+ struct crst_table *table;
+ union asce asce;
+
+ /* Replacing segment type ASCEs would cause serious issues */
+ if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
+ return -EINVAL;
+
+ table = dat_alloc_crst_sleepable(0);
+ if (!table)
+ return -ENOMEM;
+ memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = gmap->asce;
+ asce.rsto = virt_to_pfn(table);
+ WRITE_ONCE(gmap->asce, asce);
+
+ return 0;
+}
+
+bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
+{
+ struct kvm *kvm = gmap->kvm;
+ struct kvm_vcpu *vcpu;
+ gfn_t prefix_gfn;
+ unsigned long i;
+
+ if (is_shadow(gmap))
+ return false;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /* Match against both prefix pages */
+ prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
+ if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
+ if (hint && kvm_s390_is_in_sie(vcpu))
+ return false;
+ VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
+ gfn_to_gpa(gfn), gfn_to_gpa(end));
+ kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
+ }
+ }
+ return true;
+}
+
+struct clear_young_pte_priv {
+ struct gmap *gmap;
+ bool young;
+};
+
+static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
+{
+ struct clear_young_pte_priv *p = walk->priv;
+ union pgste pgste;
+ union pte pte, new;
+
+ pte = READ_ONCE(*ptep);
+
+ if (!pte.s.pr || (!pte.s.y && pte.h.i))
+ return 0;
+
+ pgste = pgste_get_lock(ptep);
+ if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
+ new = pte;
+ new.h.i = 1;
+ new.s.y = 0;
+ if ((new.s.d || !new.h.p) && !new.s.s)
+ folio_set_dirty(pfn_folio(pte.h.pfra));
+ new.s.d = 0;
+ new.h.p = 1;
+
+ pgste.prefix_notif = 0;
+ pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
+ }
+ p->young = 1;
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
+{
+ struct clear_young_pte_priv *priv = walk->priv;
+ union crste crste, new;
+
+ do {
+ crste = READ_ONCE(*crstep);
+
+ if (!crste.h.fc)
+ return 0;
+ if (!crste.s.fc1.y && crste.h.i)
+ return 0;
+ if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
+ break;
+
+ new = crste;
+ new.h.i = 1;
+ new.s.fc1.y = 0;
+ new.s.fc1.prefix_notif = 0;
+ if (new.s.fc1.d || !new.h.p)
+ folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
+ new.s.fc1.d = 0;
+ new.h.p = 1;
+ } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
+
+ priv->young = 1;
+ return 0;
+}
/**
- * gmap_make_secure() - make one guest page secure
- * @gmap: the guest gmap
- * @gaddr: the guest address that needs to be made secure
- * @uvcb: the UVCB specifying which operation needs to be performed
+ * gmap_age_gfn() - Test and clear the young state of a range of guest pages.
+ * @gmap: The guest gmap.
+ * @start: The first gfn to test.
+ * @end: The gfn after the last one to test.
*
- * Context: needs to be called with kvm->srcu held.
- * Return: 0 on success, < 0 in case of error.
+ * Context: Called with the kvm mmu write lock held.
+ * Return: 1 if any page in the given range was young, otherwise 0.
*/
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
{
- struct kvm *kvm = gmap->private;
+ const struct dat_walk_ops ops = {
+ .pte_entry = gmap_clear_young_pte,
+ .pmd_entry = gmap_clear_young_crste,
+ .pud_entry = gmap_clear_young_crste,
+ };
+ struct clear_young_pte_priv priv = {
+ .gmap = gmap,
+ .young = false,
+ };
+
+ _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
+
+ return priv.young;
+}
+
+struct gmap_unmap_priv {
+ struct gmap *gmap;
+ struct kvm_memory_slot *slot;
+};
+
+static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
+{
+ struct gmap_unmap_priv *priv = w->priv;
+ struct folio *folio = NULL;
unsigned long vmaddr;
+ union pgste pgste;
- lockdep_assert_held(&kvm->srcu);
+ pgste = pgste_get_lock(ptep);
+ if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
+ vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
+ gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
+ }
+ if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+ folio = pfn_folio(ptep->h.pfra);
+ pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
+ pgste_set_unlock(ptep, pgste);
+ if (folio)
+ uv_convert_from_secure_folio(folio);
- vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return -EFAULT;
- return make_hva_secure(gmap->mm, vmaddr, uvcb);
+ return 0;
}
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
- struct uv_cb_cts uvcb = {
- .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
- .header.len = sizeof(uvcb),
- .guest_handle = gmap->guest_handle,
- .gaddr = gaddr,
+ struct gmap_unmap_priv *priv = walk->priv;
+ struct folio *folio = NULL;
+ union crste old = *crstep;
+
+ if (!old.h.fc)
+ return 0;
+
+ if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+ folio = phys_to_folio(crste_origin_large(old));
+ /* No races should happen because kvm->mmu_lock is held in write mode */
+ KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
+ priv->gmap->kvm);
+ if (folio)
+ uv_convert_from_secure_folio(folio);
+
+ return 0;
+}
+
+/**
+ * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
+ * @gmap: The gmap to act on.
+ * @slot: The memslot in which the range is located.
+ * @start: The first gfn to unmap.
+ * @end: The gfn after the last one to unmap.
+ *
+ * Context: Called with the kvm mmu write lock held.
+ * Return: false
+ */
+bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
+{
+ const struct dat_walk_ops ops = {
+ .pte_entry = _gmap_unmap_pte,
+ .pmd_entry = _gmap_unmap_crste,
+ .pud_entry = _gmap_unmap_crste,
};
+ struct gmap_unmap_priv priv = {
+ .gmap = gmap,
+ .slot = slot,
+ };
+
+ lockdep_assert_held_write(&gmap->kvm->mmu_lock);
+
+ _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
+ return false;
+}
+
+static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
+ struct gmap *gmap)
+{
+ union pte pte = READ_ONCE(*ptep);
+
+ if (!pte.s.pr || (pte.h.p && !pte.s.sd))
+ return pgste;
+
+ /*
+	 * If this page contains one or more prefixes of vCPUs that are currently
+ * running, do not reset the protection, leave it marked as dirty.
+ */
+ if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
+ pte.h.p = 1;
+ pte.s.sd = 0;
+ pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
+ }
+
+ mark_page_dirty(gmap->kvm, gfn);
+
+ return pgste;
+}
+
+static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
+ struct dat_walk *walk)
+{
+ struct gmap *gmap = walk->priv;
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
+ struct dat_walk *walk)
+{
+ struct gmap *gmap = walk->priv;
+ union crste crste, new;
+
+ if (fatal_signal_pending(current))
+ return 1;
+ do {
+ crste = READ_ONCE(*table);
+ if (!crste.h.fc)
+ return 0;
+ if (crste.h.p && !crste.s.fc1.sd)
+ return 0;
+
+ /*
+ * If this large page contains one or more prefixes of vCPUs that are
+ * currently running, do not reset the protection, leave it marked as
+ * dirty.
+ */
+ if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
+ break;
+ new = crste;
+ new.h.p = 1;
+ new.s.fc1.sd = 0;
+ } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
+
+ for ( ; gfn < end; gfn++)
+ mark_page_dirty(gmap->kvm, gfn);
- return gmap_make_secure(gmap, gaddr, &uvcb);
+ return 0;
+}
+
+void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
+{
+ const struct dat_walk_ops walk_ops = {
+ .pte_entry = _pte_test_and_clear_softdirty,
+ .pmd_entry = _crste_test_and_clear_softdirty,
+ .pud_entry = _crste_test_and_clear_softdirty,
+ };
+
+ lockdep_assert_held(&gmap->kvm->mmu_lock);
+
+ _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
+}
+
+static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
+{
+ union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
+
+ /* Somehow the crste is not large anymore, let the slow path deal with it. */
+ if (!oldcrste.h.fc)
+ return 1;
+
+ f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
+ f->writable = oldcrste.s.fc1.w;
+
+ /* Appropriate permissions already (race with another handler), nothing to do. */
+ if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
+ return 0;
+
+ if (!f->write_attempt || oldcrste.s.fc1.w) {
+ f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
+ newcrste = oldcrste;
+ newcrste.h.i = 0;
+ newcrste.s.fc1.y = 1;
+ if (f->write_attempt) {
+ newcrste.h.p = 0;
+ newcrste.s.fc1.d = 1;
+ newcrste.s.fc1.sd = 1;
+ }
+ /* In case of races, let the slow path deal with it. */
+ return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
+ }
+ /* Trying to write on a read-only page, let the slow path deal with it. */
+ return 1;
+}
+
+static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
+ struct guest_fault *f)
+{
+ union pte newpte, oldpte = READ_ONCE(*f->ptep);
+
+ f->pfn = oldpte.h.pfra;
+ f->writable = oldpte.s.w;
+
+ /* Appropriate permissions already (race with another handler), nothing to do. */
+ if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
+ return 0;
+ /* Trying to write on a read-only page, let the slow path deal with it. */
+ if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
+ return 1;
+
+ newpte = oldpte;
+ newpte.h.i = 0;
+ newpte.s.y = 1;
+ if (f->write_attempt) {
+ newpte.h.p = 0;
+ newpte.s.d = 1;
+ newpte.s.sd = 1;
+ }
+ *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
+
+ return 0;
}
/**
- * __gmap_destroy_page() - Destroy a guest page.
- * @gmap: the gmap of the guest
- * @page: the page to destroy
+ * gmap_try_fixup_minor() - Try to fix up a minor gmap fault.
+ * @gmap: The gmap whose fault needs to be resolved.
+ * @fault: Describes the fault that is being resolved.
*
- * An attempt will be made to destroy the given guest page. If the attempt
- * fails, an attempt is made to export the page. If both attempts fail, an
- * appropriate error is returned.
+ * A minor fault is a fault that can be resolved quickly within gmap.
+ * The page is already mapped, the fault is only due to dirty/young tracking.
*
- * Context: must be called holding the mm lock for gmap->mm
+ * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
+ * not be resolved and needs to go through the slow path.
*/
-static int __gmap_destroy_page(struct gmap *gmap, struct page *page)
+int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
{
- struct folio *folio = page_folio(page);
+ union pgste pgste;
int rc;
- /*
- * See gmap_make_secure(): large folios cannot be secure. Small
- * folio implies FW_LEVEL_PTE.
- */
- if (folio_test_large(folio))
- return -EFAULT;
+ lockdep_assert_held(&gmap->kvm->mmu_lock);
- rc = uv_destroy_folio(folio);
- /*
- * Fault handlers can race; it is possible that two CPUs will fault
- * on the same secure page. One CPU can destroy the page, reboot,
- * re-enter secure mode and import it, while the second CPU was
- * stuck at the beginning of the handler. At some point the second
- * CPU will be able to progress, and it will not be able to destroy
- * the page. In that case we do not want to terminate the process,
- * we instead try to export the page.
- */
+ rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
+ &fault->crstep, &fault->ptep);
+ /* If a PTE or a leaf CRSTE could not be reached, slow path. */
if (rc)
- rc = uv_convert_from_secure_folio(folio);
+ return 1;
+ if (fault->ptep) {
+ pgste = pgste_get_lock(fault->ptep);
+ rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
+ if (!rc && fault->callback)
+ fault->callback(fault);
+ pgste_set_unlock(fault->ptep, pgste);
+ } else {
+ rc = gmap_handle_minor_crste_fault(gmap, fault);
+ if (!rc && fault->callback)
+ fault->callback(fault);
+ }
return rc;
}
+static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f,
+ struct kvm_memory_slot *slot)
+{
+ return false;
+}
+
/**
- * gmap_destroy_page() - Destroy a guest page.
- * @gmap: the gmap of the guest
- * @gaddr: the guest address to destroy
+ * gmap_1m_allowed() - Check whether a 1M hugepage is allowed.
+ * @gmap: The gmap of the guest.
+ * @f: Describes the fault that is being resolved.
+ * @slot: The memslot the faulting address belongs to.
*
- * An attempt will be made to destroy the given guest page. If the attempt
- * fails, an attempt is made to export the page. If both attempts fail, an
- * appropriate error is returned.
+ * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for
+ * @gmap, whether the offset of the address in the 1M virtual frame is the
+ * same as the offset in the physical 1M frame, and finally whether the whole
+ * 1M page would fit in the given memslot.
*
- * Context: may sleep.
+ * Return: true if a 1M hugepage is allowed to back the faulting address, false
+ * otherwise.
*/
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f,
+ struct kvm_memory_slot *slot)
+{
+ return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) &&
+ !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) &&
+ slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) &&
+ slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT);
+}
+
+static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
+ struct guest_fault *f)
{
- struct page *page;
+ union crste oldval, newval;
+ union pte newpte, oldpte;
+ union pgste pgste;
int rc = 0;
- mmap_read_lock(gmap->mm);
- page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr));
- if (page)
- rc = __gmap_destroy_page(gmap, page);
- kvm_release_page_clean(page);
- mmap_read_unlock(gmap->mm);
+ rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
+ &f->crstep, &f->ptep);
+ if (rc == -ENOMEM)
+ return rc;
+ if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
+ return rc;
+ if (rc)
+ return -EAGAIN;
+ if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
+ return -EINVAL;
+
+ if (f->ptep) {
+ pgste = pgste_get_lock(f->ptep);
+ oldpte = *f->ptep;
+ newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+ newpte.s.sd = oldpte.s.sd;
+ oldpte.s.sd = 0;
+ if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+ pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
+ if (f->callback)
+ f->callback(f);
+ } else {
+ rc = -EAGAIN;
+ }
+ pgste_set_unlock(f->ptep, pgste);
+ } else {
+ do {
+ oldval = READ_ONCE(*f->crstep);
+ newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+ f->write_attempt | oldval.s.fc1.d);
+ newval.s.fc1.s = !f->page;
+ newval.s.fc1.sd = oldval.s.fc1.sd;
+ if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+ crste_origin_large(oldval) != crste_origin_large(newval))
+ return -EAGAIN;
+ } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
+ if (f->callback)
+ f->callback(f);
+ }
+
return rc;
}
+
+int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
+ struct kvm_memory_slot *slot)
+{
+ unsigned int order;
+ int level;
+
+ lockdep_assert_held(&gmap->kvm->mmu_lock);
+
+ level = TABLE_TYPE_PAGE_TABLE;
+ if (f->page) {
+ order = folio_order(page_folio(f->page));
+ if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
+ level = TABLE_TYPE_REGION3;
+ else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
+ level = TABLE_TYPE_SEGMENT;
+ }
+ return _gmap_link(mc, gmap, level, f);
+}
+
+static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
+ gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
+{
+ union crste newcrste, oldcrste;
+ struct page_table *pt;
+ union crste *crstep;
+ union pte *ptep;
+ int rc;
+
+ if (force_alloc)
+ rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
+ TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+ else
+ rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
+ TABLE_TYPE_SEGMENT, &crstep, &ptep);
+ if (rc)
+ return rc;
+ if (!ptep) {
+ newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
+ newcrste.h.i = 1;
+ newcrste.h.fc0.tl = 1;
+ } else {
+ pt = pte_table_start(ptep);
+ dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
+ newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
+ }
+ rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
+ &crstep, &ptep);
+ if (rc)
+ return rc;
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (oldcrste.val == newcrste.val)
+ break;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
+ return 0;
+}
+
+static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
+{
+ union pte *ptep;
+ int rc;
+
+ rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
+ TABLE_TYPE_SEGMENT, crstepp, &ptep);
+ if (rc || (!ptep && !crste_is_ucas(**crstepp)))
+ return -EREMOTE;
+ if (!ptep)
+ return 1;
+ *gaddr &= ~_SEGMENT_MASK;
+ *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
+ return 0;
+}
+
+/**
+ * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
+ * @mc: The memory cache to be used for allocations.
+ * @gmap: The per-cpu gmap.
+ * @gaddr: Pointer to the address to be translated, will get overwritten with
+ * the translated address in case of success.
+ *
+ * Translates the per-vCPU guest address into a fake guest address, which can
+ * then be used with the fake memslots that are identity mapping userspace.
+ * This allows ucontrol VMs to use the normal fault resolution path, like
+ * normal VMs.
+ *
+ * Return: %0 in case of success, %-EREMOTE if the address cannot be
+ * translated, or another negative error value if a memory allocation fails.
+ */
+int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
+{
+ gpa_t translated_address;
+ union crste *crstep;
+ gfn_t gfn;
+ int rc;
+
+ gfn = gpa_to_gfn(*gaddr);
+
+ scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
+ rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
+ if (rc <= 0)
+ return rc;
+ }
+ do {
+ scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
+ rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
+ if (rc <= 0)
+ return rc;
+ translated_address = (*gaddr & ~_SEGMENT_MASK) |
+ (crstep->val & _SEGMENT_MASK);
+ rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
+ }
+ if (!rc) {
+ *gaddr = translated_address;
+ return 0;
+ }
+ if (rc != -ENOMEM)
+ return -EREMOTE;
+ rc = kvm_s390_mmu_cache_topup(mc);
+ if (rc)
+ return rc;
+ } while (1);
+ return 0;
+}
+
+int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
+{
+ struct kvm_s390_mmu_cache *mc;
+	int rc = 0;
+
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc)
+		return -ENOMEM;
+
+	while (count) {
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
+		if (rc == -ENOMEM) {
+			rc = kvm_s390_mmu_cache_topup(mc);
+			if (rc)
+				break;
+			continue;
+		}
+		if (rc)
+			break;
+
+		count--;
+		c_gfn += _PAGE_ENTRIES;
+		p_gfn += _PAGE_ENTRIES;
+	}
+	kvm_s390_free_mmu_cache(mc);
+	return rc;
+}
+
+static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
+{
+ union crste *crstep;
+ union pte *ptep;
+ int rc;
+
+ rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
+ if (rc)
+ return;
+ while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
+ ;
+}
+
+void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
+{
+ guard(read_lock)(&gmap->kvm->mmu_lock);
+
+ for ( ; count; count--, c_gfn += _PAGE_ENTRIES)
+ gmap_ucas_unmap_one(gmap, c_gfn);
+}
+
+static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct gmap *gmap = walk->priv;
+ union crste crste, newcrste;
+
+ crste = READ_ONCE(*crstep);
+ newcrste = _CRSTE_EMPTY(crste.h.tt);
+
+ while (crste_leaf(crste)) {
+ if (crste_prefix(crste))
+ gmap_unmap_prefix(gmap, gfn, next);
+ if (crste.s.fc1.vsie_notif)
+ gmap_handle_vsie_unshadow_event(gmap, gfn);
+ if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
+ break;
+ crste = READ_ONCE(*crstep);
+ }
+
+ if (need_resched())
+ return next;
+
+ return 0;
+}
+
+void gmap_split_huge_pages(struct gmap *gmap)
+{
+ const struct dat_walk_ops ops = {
+ .pmd_entry = _gmap_split_crste,
+ .pud_entry = _gmap_split_crste,
+ };
+ gfn_t start = 0;
+
+ do {
+ scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+ start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
+ &ops, DAT_WALK_IGN_HOLES, gmap);
+ cond_resched();
+ } while (start);
+}
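
The walk callbacks above follow a common convention: returning a non-zero gfn stops the walk at that point and _dat_walk_gfn_range() returns it, so the outer loop can drop the lock, cond_resched() and resume. A generic sketch of that pattern (the callback and function names are made up):

static long example_walk_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	/* ... operate on *ptep ... */
	if (need_resched())
		return next;	/* stop; the caller resumes from this gfn */
	return 0;		/* keep walking */
}

static void example_chunked_walk(struct gmap *gmap)
{
	const struct dat_walk_ops ops = { .pte_entry = example_walk_pte, };
	gfn_t gfn = 0;

	do {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce,
						  &ops, DAT_WALK_IGN_HOLES, NULL);
		cond_resched();
	} while (gfn);
}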
+
+static int _gmap_enable_skeys(struct gmap *gmap)
+{
+ gfn_t start = 0;
+ int rc;
+
+ if (uses_skeys(gmap))
+ return 0;
+
+ set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
+ rc = gmap_helper_disable_cow_sharing();
+ if (rc) {
+ clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
+ return rc;
+ }
+
+ do {
+ scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+ start = dat_reset_skeys(gmap->asce, start);
+ cond_resched();
+ } while (start);
+ return 0;
+}
+
+int gmap_enable_skeys(struct gmap *gmap)
+{
+ int rc;
+
+ mmap_write_lock(gmap->kvm->mm);
+ rc = _gmap_enable_skeys(gmap);
+ mmap_write_unlock(gmap->kvm->mm);
+ return rc;
+}
+
+static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ if (!ptep->s.pr)
+ return 0;
+ __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ phys_addr_t origin, cur, end;
+
+ if (!crstep->h.fc || !crstep->s.fc1.pr)
+ return 0;
+
+ origin = crste_origin_large(*crstep);
+ cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
+ end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
+ for ( ; cur < end; cur += PAGE_SIZE)
+ __kvm_s390_pv_destroy_page(phys_to_page(cur));
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
+{
+ const struct dat_walk_ops ops = {
+ .pte_entry = _destroy_pages_pte,
+ .pmd_entry = _destroy_pages_crste,
+ .pud_entry = _destroy_pages_crste,
+ };
+
+ do {
+ scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+ start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
+ DAT_WALK_IGN_HOLES, NULL);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ } while (start && start < end);
+ return 0;
+}
+
+int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
+{
+ struct vsie_rmap *rmap __free(kvfree) = NULL;
+ struct vsie_rmap *temp;
+ void __rcu **slot;
+ int rc = 0;
+
+ KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+ lockdep_assert_held(&sg->host_to_rmap_lock);
+
+ rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
+ if (!rmap)
+ return -ENOMEM;
+
+ rmap->r_gfn = r_gfn;
+ rmap->level = level;
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
+ for (temp = rmap->next; temp; temp = temp->next) {
+ if (temp->val == rmap->val)
+ return 0;
+ }
+ radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
+ } else {
+ rmap->next = NULL;
+ rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
+ if (rc)
+ return rc;
+ }
+ rmap = NULL;
+
+ return 0;
+}
+
+int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
+ kvm_pfn_t pfn, int level, bool wr)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ union pte pte;
+ int flags, rc;
+
+ KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
+ rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
+ TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+ if (rc)
+ return rc;
+ if (level <= TABLE_TYPE_REGION1) {
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
+ }
+ if (rc)
+ return rc;
+
+ if (!pgste_get_trylock(ptep, &pgste))
+ return -EAGAIN;
+ pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
+ pte.h.p = 1;
+ pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
+ pgste.vsie_notif = 1;
+ pgste_set_unlock(ptep, pgste);
+
+ return 0;
+}
+
+static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+void gmap_set_cmma_all_dirty(struct gmap *gmap)
+{
+ const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
+ gfn_t gfn = 0;
+
+ do {
+ scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+ gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
+ DAT_WALK_IGN_HOLES, NULL);
+ cond_resched();
+ } while (gfn);
+}
+
+static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
+{
+ unsigned long align = PAGE_SIZE;
+ gpa_t gaddr = gfn_to_gpa(r_gfn);
+ union crste *crstep;
+ union crste crste;
+ union pte *ptep;
+
+ if (level > TABLE_TYPE_PAGE_TABLE)
+ align = 1UL << (11 * level + _SEGMENT_SHIFT);
+ kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
+ sg->invalidated = true;
+ if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
+ return;
+ if (ptep) {
+ if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
+ dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
+ return;
+ }
+
+ crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
+ if (crste_leaf(crste) || crste.h.i)
+ return;
+ if (is_pmd(crste))
+ dat_free_pt(dereference_pmd(crste.pmd));
+ else
+ dat_free_level(dereference_crste(crste), true);
+}
+
+static void gmap_unshadow(struct gmap *sg)
+{
+ struct gmap_cache *gmap_cache, *next;
+
+ KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+ KVM_BUG_ON(!sg->parent, sg->kvm);
+
+ lockdep_assert_held(&sg->parent->children_lock);
+
+ gmap_remove_child(sg);
+ kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
+
+ list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
+ gmap_cache->gmap = NULL;
+ list_del(&gmap_cache->list);
+ }
+
+ gmap_put(sg);
+}
+
+void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
+{
+ struct vsie_rmap *rmap, *rnext, *head;
+ struct gmap *sg, *next;
+ gfn_t start, end;
+
+ list_for_each_entry_safe(sg, next, &parent->children, list) {
+ start = sg->guest_asce.rsto;
+ end = start + sg->guest_asce.tl + 1;
+ if (!sg->guest_asce.r && gfn >= start && gfn < end) {
+ gmap_unshadow(sg);
+ continue;
+ }
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ head = radix_tree_delete(&sg->host_to_rmap, gfn);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
+ }
+}
+
+/**
+ * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
+ * @parent: Pointer to the parent gmap.
+ * @asce: ASCE for which the shadow table is created.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * Context: Called with parent->children_lock held.
+ *
+ * Return: The pointer to a gmap if a shadow table with the given asce is
+ * already available, otherwise NULL.
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
+{
+ struct gmap *sg;
+
+ lockdep_assert_held(&parent->children_lock);
+ list_for_each_entry(sg, &parent->children, list) {
+ if (!gmap_is_shadow_valid(sg, asce, edat_level))
+ continue;
+ return sg;
+ }
+ return NULL;
+}
+
+#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)
+struct gmap_protect_asce_top_level {
+ unsigned long seq;
+ struct guest_fault f[CRST_TABLE_PAGES];
+};
+
+static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+ struct gmap_protect_asce_top_level *context)
+{
+ struct gmap *parent;
+ int rc, i;
+
+ guard(write_lock)(&sg->kvm->mmu_lock);
+
+ if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
+ return -EAGAIN;
+
+ parent = READ_ONCE(sg->parent);
+ if (!parent)
+ return -EAGAIN;
+ scoped_guard(spinlock, &parent->children_lock) {
+ if (READ_ONCE(sg->parent) != parent)
+ return -EAGAIN;
+ sg->invalidated = false;
+ for (i = 0; i < CRST_TABLE_PAGES; i++) {
+ if (!context->f[i].valid)
+ continue;
+ rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
+ TABLE_TYPE_REGION1 + 1, context->f[i].writable);
+ if (rc)
+ return rc;
+ }
+ gmap_add_child(sg->parent, sg);
+ }
+
+ kvm_s390_release_faultin_array(sg->kvm, context->f, false);
+ return 0;
+}
+
+static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+ struct gmap_protect_asce_top_level *context)
+{
+ int rc;
+
+ if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
+ return -EAGAIN;
+ do {
+ rc = kvm_s390_mmu_cache_topup(mc);
+ if (rc)
+ return rc;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc)
+ return rc;
+ rc = __gmap_protect_asce_top_level(mc, sg, context);
+ radix_tree_preload_end();
+ } while (rc == -ENOMEM);
+
+ return rc;
+}
+
+static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
+{
+ struct gmap_protect_asce_top_level context = {};
+ union asce asce = sg->guest_asce;
+ int rc;
+
+ KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+
+ context.seq = sg->kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
+
+ rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
+ if (rc > 0)
+ rc = -EFAULT;
+ if (!rc)
+ rc = _gmap_protect_asce_top_level(mc, sg, &context);
+ if (rc)
+ kvm_s390_release_faultin_array(sg->kvm, context.f, true);
+ return rc;
+}
+
+/**
+ * gmap_create_shadow() - Create/find a shadow guest address space.
+ * @mc: The cache to use to allocate dat tables.
+ * @parent: Pointer to the parent gmap.
+ * @asce: ASCE for which the shadow table is created.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * The pages of the top level page table referred to by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * The shadow gmap is returned with one extra reference.
+ *
+ * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
+ union asce asce, int edat_level)
+{
+ struct gmap *sg, *new;
+ int rc;
+
+ if (WARN_ON(!parent))
+ return ERR_PTR(-EINVAL);
+
+ scoped_guard(spinlock, &parent->children_lock) {
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ gmap_get(sg);
+ return sg;
+ }
+ }
+ /* Create a new shadow gmap. */
+ new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->guest_asce = asce;
+ new->edat_level = edat_level;
+ set_bit(GMAP_FLAG_SHADOW, &new->flags);
+
+ scoped_guard(spinlock, &parent->children_lock) {
+ /* Recheck if another CPU created the same shadow. */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ gmap_put(new);
+ gmap_get(sg);
+ return sg;
+ }
+ if (asce.r) {
+ /* Only allow one real-space gmap shadow. */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->guest_asce.r) {
+ scoped_guard(write_lock, &parent->kvm->mmu_lock)
+ gmap_unshadow(sg);
+ break;
+ }
+ }
+ gmap_add_child(parent, new);
+ /* Nothing to protect, return right away. */
+ gmap_get(new);
+ return new;
+ }
+ }
+
+ gmap_get(new);
+ new->parent = parent;
+ /* Protect while inserting, protects against invalidation races. */
+ rc = gmap_protect_asce_top_level(mc, new);
+ if (rc) {
+ new->parent = NULL;
+ gmap_put(new);
+ gmap_put(new);
+ return ERR_PTR(rc);
+ }
+ return new;
+}
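
A hypothetical caller sketch (not part of this patch): gmap_create_shadow() may return ERR_PTR(-EAGAIN) while the parent tables are being invalidated, so callers retry, and the extra reference has to be dropped with gmap_put() once the shadow control block stops using the gmap.

static struct gmap *example_get_shadow_gmap(struct kvm_vcpu *vcpu, struct gmap *parent,
					    union asce guest_asce, int edat_level)
{
	struct gmap *sg;

	do {
		sg = gmap_create_shadow(vcpu->arch.mc, parent, guest_asce, edat_level);
	} while (IS_ERR(sg) && PTR_ERR(sg) == -EAGAIN);

	/* Either a valid gmap holding an extra reference, or ERR_PTR(-ENOMEM/-EFAULT). */
	return sg;
}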
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index c8f031c9ea5f..96ee1395a592 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -10,30 +10,246 @@
#ifndef ARCH_KVM_S390_GMAP_H
#define ARCH_KVM_S390_GMAP_H
-#define GMAP_SHADOW_FAKE_TABLE 1ULL
+#include "dat.h"
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
+/**
+ * enum gmap_flags - Flags of a gmap.
+ *
+ * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap.
+ * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0
+ * only for ucontrol per-cpu gmaps, since they
+ * share the page tables with the main gmap.
+ * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap).
+ * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap,
+ * independently of the page size used by userspace.
+ * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap,
+ * independently of the page size used by userspace.
+ * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap.
+ * @GMAP_FLAG_USES_SKEYS: The guest uses storage keys.
+ * @GMAP_FLAG_USES_CMM: The guest uses CMMA.
+ * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping.
+ */
+enum gmap_flags {
+ GMAP_FLAG_SHADOW = 0,
+ GMAP_FLAG_OWNS_PAGETABLES,
+ GMAP_FLAG_IS_UCONTROL,
+ GMAP_FLAG_ALLOW_HPAGE_1M,
+ GMAP_FLAG_ALLOW_HPAGE_2G,
+ GMAP_FLAG_PFAULT_ENABLED,
+ GMAP_FLAG_USES_SKEYS,
+ GMAP_FLAG_USES_CMM,
+ GMAP_FLAG_EXPORT_ON_UNMAP,
+};
/**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
+ * struct gmap - Guest address space.
*
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise, the
- * caller has to request a new shadow gmap in this case.
+ * @flags: GMAP_FLAG_* flags.
+ * @edat_level: The edat level of this shadow gmap.
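+ * @invalidated: Set when (parts of) the shadow mappings were invalidated;
+ *               checked by the VSIE code to trigger a revalidation.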
+ * @kvm: The vm.
+ * @asce: The ASCE used by this gmap.
+ * @list: List head used in children gmaps for the children gmap list.
+ * @children_lock: Protects children and scb_users.
+ * @children: List of child gmaps of this gmap.
+ * @scb_users: List of vsie_scb that use this shadow gmap.
+ * @parent: Parent gmap of a child gmap.
+ * @guest_asce: Original ASCE of this shadow gmap.
+ * @host_to_rmap_lock: Protects host_to_rmap.
+ * @host_to_rmap: Radix tree mapping host addresses to guest addresses.
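+ * @refcount: Reference count of this gmap.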
+ */
+struct gmap {
+ unsigned long flags;
+ unsigned char edat_level;
+ bool invalidated;
+ struct kvm *kvm;
+ union asce asce;
+ struct list_head list;
+ spinlock_t children_lock; /* Protects: children, scb_users */
+ struct list_head children;
+ struct list_head scb_users;
+ struct gmap *parent;
+ union asce guest_asce;
+ spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */
+ struct radix_tree_root host_to_rmap;
+ refcount_t refcount;
+};
+
+struct gmap_cache {
+ struct list_head list;
+ struct gmap *gmap;
+};
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+ for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
+int s390_replace_asce(struct gmap *gmap);
+bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint);
+bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end);
+bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
+int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault);
+struct gmap *gmap_new(struct kvm *kvm, gfn_t limit);
+struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit);
+void gmap_remove_child(struct gmap *child);
+void gmap_dispose(struct gmap *gmap);
+int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault,
+ struct kvm_memory_slot *slot);
+void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end);
+int gmap_set_limit(struct gmap *gmap, gfn_t limit);
+int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr);
+int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count);
+void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count);
+int gmap_enable_skeys(struct gmap *gmap);
+int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible);
+int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level);
+int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
+ kvm_pfn_t pfn, int level, bool wr);
+void gmap_set_cmma_all_dirty(struct gmap *gmap);
+void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn);
+struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
+ union asce asce, int edat_level);
+void gmap_split_huge_pages(struct gmap *gmap);
+
+static inline bool uses_skeys(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
+}
+
+static inline bool uses_cmm(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags);
+}
+
+static inline bool pfault_enabled(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags);
+}
+
+static inline bool is_ucontrol(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags);
+}
+
+static inline bool is_shadow(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_SHADOW, &gmap->flags);
+}
+
+static inline bool owns_page_tables(struct gmap *gmap)
+{
+ return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
+}
+
+static inline struct gmap *gmap_put(struct gmap *gmap)
+{
+ if (refcount_dec_and_test(&gmap->refcount))
+ gmap_dispose(gmap);
+ return NULL;
+}
+
+static inline void gmap_get(struct gmap *gmap)
+{
+ WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount)));
+}
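
gmap_put() deliberately returns NULL so a caller can drop its reference and clear its pointer in one statement; a usage note (illustration only, not part of this patch):

	sg = gmap_put(sg);	/* reference dropped, sg is now NULL */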
+
+static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
+{
+ scoped_guard(spinlock, &parent->children_lock)
+ _gmap_handle_vsie_unshadow_event(parent, gfn);
+}
+
+static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
+{
+ return _gmap_unmap_prefix(gmap, gfn, end, true);
+}
+
+static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
+{
+ return _gmap_unmap_prefix(gmap, gfn, end, false);
+}
+
+static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
+ union pgste pgste, gfn_t gfn, bool needs_lock)
+{
+ lockdep_assert_held(&gmap->kvm->mmu_lock);
+ if (!needs_lock)
+ lockdep_assert_held(&gmap->children_lock);
+ else
+ lockdep_assert_not_held(&gmap->children_lock);
+
+ if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) {
+ pgste.prefix_notif = 0;
+ gmap_unmap_prefix(gmap, gfn, gfn + 1);
+ }
+ if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) {
+ pgste.vsie_notif = 0;
+ if (needs_lock)
+ gmap_handle_vsie_unshadow_event(gmap, gfn);
+ else
+ _gmap_handle_vsie_unshadow_event(gmap, gfn);
+ }
+ if (!ptep->s.d && newpte.s.d && !newpte.s.s)
+ SetPageDirty(pfn_to_page(newpte.h.pfra));
+ return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
+}
+
+static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
+ union pgste pgste, gfn_t gfn)
+{
+ return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
+}
+
+static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn, bool needs_lock)
+{
+ unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;
+
+ if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
+ return true;
+
+ lockdep_assert_held(&gmap->kvm->mmu_lock);
+ if (!needs_lock)
+ lockdep_assert_held(&gmap->children_lock);
+
+ gfn = ALIGN_DOWN(gfn, align);
+ if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
+ newcrste.s.fc1.prefix_notif = 0;
+ gmap_unmap_prefix(gmap, gfn, gfn + align);
+ }
+ if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
+ (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
+ newcrste.s.fc1.vsie_notif = 0;
+ if (needs_lock)
+ gmap_handle_vsie_unshadow_event(gmap, gfn);
+ else
+ _gmap_handle_vsie_unshadow_event(gmap, gfn);
+ }
+ if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
+ SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
+ return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
+}
+
+static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn)
+{
+ return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
+}
+
+/**
+ * gmap_is_shadow_valid() - Check if a shadow guest address space matches the
+ * given properties and is still valid.
+ * @sg: Pointer to the shadow guest address space structure.
+ * @asce: ASCE for which the shadow table is requested.
+ * @edat_level: Edat level to be used for the shadow translation.
*
+ * Return: true if the gmap shadow is still valid and matches the given
+ * properties, so the caller can continue using it; false otherwise, in which
+ * case the caller has to request a new shadow gmap.
*/
-static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level)
{
- if (sg->removed)
- return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
+ return sg->guest_asce.val == asce.val && sg->edat_level == edat_level;
}
-#endif
+#endif /* ARCH_KVM_S390_GMAP_H */
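A minimal sketch of how a vsie caller might use gmap_is_shadow_valid(); the helper name reuse_shadow_or_null() and its surrounding context are hypothetical and only illustrate the documented true/false contract, they are not part of this patch:

	/* illustrative only: reuse a cached shadow gmap when it still matches */
	static struct gmap *reuse_shadow_or_null(struct gmap *sg, union asce guest_asce,
						 int edat_level)
	{
		if (sg && gmap_is_shadow_valid(sg, guest_asce, edat_level))
			return sg;	/* still valid, keep using it */
		return NULL;		/* caller must request a new shadow gmap */
	}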
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 80879fc73c90..69835e1d4f20 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -232,18 +232,14 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
}
if (nr_wp > 0) {
- wp_info = kmalloc_array(nr_wp,
- sizeof(*wp_info),
- GFP_KERNEL_ACCOUNT);
+ wp_info = kmalloc_objs(*wp_info, nr_wp, GFP_KERNEL_ACCOUNT);
if (!wp_info) {
ret = -ENOMEM;
goto error;
}
}
if (nr_bp > 0) {
- bp_info = kmalloc_array(nr_bp,
- sizeof(*bp_info),
- GFP_KERNEL_ACCOUNT);
+ bp_info = kmalloc_objs(*bp_info, nr_bp, GFP_KERNEL_ACCOUNT);
if (!bp_info) {
ret = -ENOMEM;
goto error;
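The kmalloc_objs() and kzalloc_obj() helpers used by this series are not defined in this diff; a plausible reading, inferred only from the call sites (an assumption, not the definitive API), is that they are type-safe wrappers that derive the element size from the object expression:

	/* assumed semantics; the real helpers may differ */
	#define kmalloc_objs(obj, n, gfp)	kmalloc_array(n, sizeof(obj), gfp)
	#define kzalloc_obj(obj, gfp)		kzalloc(sizeof(obj), gfp)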
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 610dd44a948b..39aff324203e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -21,7 +21,7 @@
#include "gaccess.h"
#include "trace.h"
#include "trace-s390.h"
-#include "gmap.h"
+#include "faultin.h"
u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
{
@@ -95,7 +95,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
vcpu->stat.exit_validity++;
trace_kvm_s390_intercept_validity(vcpu, viwhy);
- KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy,
+ KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy,
current->pid, vcpu->kvm);
/* do not warn on invalid runtime instrumentation mode */
@@ -368,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
- if (rc != 0)
+
+ do {
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false);
+ } while (rc == -EAGAIN);
+ if (rc)
return rc;
/* Ensure that the source is paged-in, no actual access -> no key checking */
@@ -377,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
- if (rc != 0)
+
+ do {
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true);
+ } while (rc == -EAGAIN);
+ if (rc)
return rc;
kvm_s390_retry_instr(vcpu);
@@ -472,6 +478,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->ipa == 0xb256)
return handle_sthyi(vcpu);
+ if (vcpu->kvm->arch.user_operexec)
+ return -EOPNOTSUPP;
+
if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
return -EOPNOTSUPP;
rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
@@ -545,7 +554,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
guest_uvcb->header.cmd);
return 0;
}
- rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb);
+ rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb);
/*
* If the unpin did not succeed, the guest will exit again for the UVC
* and we will retry the unpin.
@@ -653,10 +662,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
break;
case ICPT_PV_PREF:
rc = 0;
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu));
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu));
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
break;
default:
return -EOPNOTSUPP;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 2811a6c093b8..07f59c3b9a7b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -7,13 +7,13 @@
* Author(s): Carsten Otte <cotte@de.ibm.com>
*/
-#define KMSG_COMPONENT "kvm-s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "kvm-s390: " fmt
#include <linux/cpufeature.h>
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
#include <linux/hrtimer.h>
+#include <linux/export.h>
#include <linux/mmu_context.h>
#include <linux/nospec.h>
#include <linux/signal.h>
@@ -26,7 +26,6 @@
#include <linux/uaccess.h>
#include <asm/sclp.h>
#include <asm/isc.h>
-#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
@@ -34,6 +33,7 @@
#include "gaccess.h"
#include "trace-s390.h"
#include "pci.h"
+#include "gmap.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
@@ -44,70 +44,34 @@ static struct kvm_s390_gib *gib;
/* handle external calls via sigp interpretation facility */
static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
{
- int c, scn;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl;
if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND))
return 0;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
if (src_id)
- *src_id = scn;
+ *src_id = sigp_ctrl.scn;
- return c;
+ return sigp_ctrl.c;
}
static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+ union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1};
int expect, rc;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl new_val = {0}, old_val;
-
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
-
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl new_val = {0}, old_val;
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
+ old_val = READ_ONCE(*sigp_ctrl);
+ old_val.c = 0;
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ expect = old_val.value;
+ rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
if (rc != expect) {
/* another external call is pending */
@@ -119,24 +83,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+
if (!kvm_s390_use_sca_entries())
return;
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- WRITE_ONCE(sigp_ctrl->value, 0);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
-
- WRITE_ONCE(sigp_ctrl->value, 0);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ WRITE_ONCE(sigp_ctrl->value, 0);
}
int psw_extint_disabled(struct kvm_vcpu *vcpu)
@@ -1002,6 +956,9 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs);
spin_unlock(&fi->lock);
+ if (!ext.ext_params)
+ return 0;
+
VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x",
ext.ext_params);
vcpu->stat.deliver_service_signal++;
@@ -1223,7 +1180,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- if (!sclp.has_sigpif)
+ if (!kvm_s390_use_sca_entries())
return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
return sca_ext_call_pending(vcpu, NULL);
@@ -1322,6 +1279,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
no_timer:
kvm_vcpu_srcu_read_unlock(vcpu);
+ vcpu->kvm->arch.float_int.last_sleep_cpu = vcpu->vcpu_idx;
kvm_vcpu_halt(vcpu);
vcpu->valid_wakeup = false;
__unset_cpu_idle(vcpu);
@@ -1547,7 +1505,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
return -EINVAL;
- if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
+ if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu))
return sca_inject_ext_call(vcpu, src_id);
if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
@@ -1794,7 +1752,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
goto out;
}
gisa_out:
- tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
+ tmp_inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
if (tmp_inti) {
tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
tmp_inti->io.io_int_word = isc_to_int_word(isc);
@@ -1948,18 +1906,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
if (!online_vcpus)
return;
- /* find idle VCPUs first, then round robin */
- sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus);
- if (sigcpu == online_vcpus) {
- do {
- sigcpu = kvm->arch.float_int.next_rr_cpu++;
- kvm->arch.float_int.next_rr_cpu %= online_vcpus;
- /* avoid endless loops if all vcpus are stopped */
- if (nr_tries++ >= online_vcpus)
- return;
- } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu)));
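+	/* start from the vCPU that last went to sleep, then round robin over the rest */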
+ for (sigcpu = kvm->arch.float_int.last_sleep_cpu; ; sigcpu++) {
+ sigcpu %= online_vcpus;
+ dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+ if (!is_vcpu_stopped(dst_vcpu))
+ break;
+ /* avoid endless loops if all vcpus are stopped */
+ if (nr_tries++ >= online_vcpus)
+ return;
}
- dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
/* make the VCPU drop out of the SIE, or wake it up if sleeping */
switch (type) {
@@ -2016,7 +1971,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt_info *inti;
int rc;
- inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
+ inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2422,7 +2377,7 @@ static int enqueue_floating_irq(struct kvm_device *dev,
return -EINVAL;
while (len >= sizeof(struct kvm_s390_irq)) {
- inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT);
+ inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
if (!inti)
return -ENOMEM;
@@ -2470,7 +2425,7 @@ static int register_io_adapter(struct kvm_device *dev,
if (dev->kvm->arch.adapters[adapter_info.id] != NULL)
return -EINVAL;
- adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT);
+ adapter = kzalloc_obj(*adapter, GFP_KERNEL_ACCOUNT);
if (!adapter)
return -ENOMEM;
@@ -2680,12 +2635,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
case KVM_DEV_FLIC_APF_ENABLE:
if (kvm_is_ucontrol(dev->kvm))
return -EINVAL;
- dev->kvm->arch.gmap->pfault_enabled = 1;
+ set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
break;
case KVM_DEV_FLIC_APF_DISABLE_WAIT:
if (kvm_is_ucontrol(dev->kvm))
return -EINVAL;
- dev->kvm->arch.gmap->pfault_enabled = 0;
+ clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
/*
* Make sure no async faults are in transition when
* clearing the queues. So we don't need to worry
@@ -2772,17 +2727,27 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
bit = bit_nr + (addr % PAGE_SIZE) * 8;
+ /* kvm_set_routing_entry() should never allow this to happen */
+ WARN_ON_ONCE(bit > (PAGE_SIZE * BITS_PER_BYTE - 1));
+
return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
}
static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
{
+ struct mm_struct *mm = kvm->mm;
struct page *page = NULL;
+ int locked = 1;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+ get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+ &page, &locked);
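+		/* GUP can drop mmap_lock and clear "locked"; only unlock if it is still held */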
+ if (locked)
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
- mmap_read_lock(kvm->mm);
- get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
- &page, NULL);
- mmap_read_unlock(kvm->mm);
return page;
}
@@ -2809,13 +2774,13 @@ static int adapter_indicators_set(struct kvm *kvm,
bit = get_ind_bit(adapter_int->ind_addr,
adapter_int->ind_offset, adapter->swap);
set_bit(bit, map);
- mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+ mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT);
set_page_dirty_lock(ind_page);
map = page_address(summary_page);
bit = get_ind_bit(adapter_int->summary_addr,
adapter_int->summary_offset, adapter->swap);
summary_set = test_and_set_bit(bit, map);
- mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+ mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT);
set_page_dirty_lock(summary_page);
srcu_read_unlock(&kvm->srcu, idx);
@@ -2865,6 +2830,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
int rc;
mci.val = mcck_info->mcic;
+
+	/* log machine checks being reinjected to all debug facilities and the kernel log */
+ VCPU_EVENT(vcpu, 2, "guest machine check %lx", mci.val);
+ KVM_EVENT(2, "guest machine check %lx", mci.val);
+	pr_info("guest machine check pid %d: %lx\n", current->pid, mci.val);
+
if (mci.sr)
cr14 |= CR14_RECOVERY_SUBMASK;
if (mci.dg)
@@ -2893,6 +2864,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
+ const struct kvm_irq_routing_s390_adapter *adapter;
u64 uaddr_s, uaddr_i;
int idx;
@@ -2903,6 +2875,14 @@ int kvm_set_routing_entry(struct kvm *kvm,
return -EINVAL;
e->set = set_adapter_int;
+ adapter = &ue->u.adapter;
+ if (adapter->summary_addr + (adapter->summary_offset / 8) >=
+ (adapter->summary_addr & PAGE_MASK) + PAGE_SIZE)
+ return -EINVAL;
+ if (adapter->ind_addr + (adapter->ind_offset / 8) >=
+ (adapter->ind_addr & PAGE_MASK) + PAGE_SIZE)
+ return -EINVAL;
+
idx = srcu_read_lock(&kvm->srcu);
uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr);
uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr);
@@ -2911,7 +2891,9 @@ int kvm_set_routing_entry(struct kvm *kvm,
if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i))
return -EFAULT;
e->adapter.summary_addr = uaddr_s;
+ e->adapter.summary_gaddr = ue->u.adapter.summary_addr;
e->adapter.ind_addr = uaddr_i;
+ e->adapter.ind_gaddr = ue->u.adapter.ind_addr;
e->adapter.summary_offset = ue->u.adapter.summary_offset;
e->adapter.ind_offset = ue->u.adapter.ind_offset;
e->adapter.adapter_id = ue->u.adapter.adapter_id;
@@ -3161,7 +3143,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm)
if (!gi->origin)
return;
gisa_clear_ipm(gi->origin);
- VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin);
}
void kvm_s390_gisa_init(struct kvm *kvm)
@@ -3177,7 +3159,7 @@ void kvm_s390_gisa_init(struct kvm *kvm)
hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
- VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin);
}
void kvm_s390_gisa_enable(struct kvm *kvm)
@@ -3218,7 +3200,7 @@ void kvm_s390_gisa_destroy(struct kvm *kvm)
process_gib_alert_list();
hrtimer_cancel(&gi->timer);
gi->origin = NULL;
- VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa);
+ VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa);
}
void kvm_s390_gisa_disable(struct kvm *kvm)
@@ -3467,7 +3449,7 @@ int __init kvm_s390_gib_init(u8 nisc)
}
}
- KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
+ KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc);
goto out;
out_unreg_gal:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fff863734975..e09960c2e6ed 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -10,10 +10,11 @@
* Jason J. Herne <jjherne@us.ibm.com>
*/
-#define KMSG_COMPONENT "kvm-s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "kvm-s390: " fmt
#include <linux/compiler.h>
+#include <linux/entry-virt.h>
+#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/hrtimer.h>
@@ -39,7 +40,7 @@
#include <asm/lowcore.h>
#include <asm/machine.h>
#include <asm/stp.h>
-#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/nmi.h>
#include <asm/isc.h>
#include <asm/sclp.h>
@@ -51,8 +52,9 @@
#include <asm/uv.h>
#include "kvm-s390.h"
#include "gaccess.h"
-#include "pci.h"
#include "gmap.h"
+#include "faultin.h"
+#include "pci.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -63,7 +65,7 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+const struct kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, inject_io),
STATS_DESC_COUNTER(VM, inject_float_mchk),
@@ -89,7 +91,7 @@ const struct kvm_stats_header kvm_vm_stats_header = {
sizeof(kvm_vm_stats_desc),
};
-const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+const struct kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, exit_userspace),
STATS_DESC_COUNTER(VCPU, exit_null),
@@ -184,7 +186,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
- STATS_DESC_COUNTER(VCPU, pfault_sync)
+ STATS_DESC_COUNTER(VCPU, pfault_sync),
+ STATS_DESC_COUNTER(VCPU, signal_exits)
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -262,17 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
/* available subfunctions indicated via query / "test bit" */
static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
-static struct gmap_notifier gmap_notifier;
-static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
/* forward declarations */
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end);
-static int sca_switch_to_extended(struct kvm *kvm);
-
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
{
u8 delta_idx = 0;
@@ -355,7 +352,7 @@ static __always_inline void pfcr_query(u8 (*query)[16])
{
asm volatile(
" lghi 0,0\n"
- " .insn rsy,0xeb0000000016,0,0,%[query]\n"
+ " .insn rsy,0xeb0000000016,0,0,%[query]"
: [query] "=QS" (*query)
:
: "cc", "0");
@@ -367,7 +364,7 @@ static __always_inline void __sortl_query(u8 (*query)[32])
" lghi 0,0\n"
" la 1,%[query]\n"
/* Parameter registers are ignored */
- " .insn rre,0xb9380000,2,4\n"
+ " .insn rre,0xb9380000,2,4"
: [query] "=R" (*query)
:
: "cc", "0", "1");
@@ -379,7 +376,7 @@ static __always_inline void __dfltcc_query(u8 (*query)[32])
" lghi 0,0\n"
" la 1,%[query]\n"
/* Parameter registers are ignored */
- " .insn rrf,0xb9390000,2,4,6,0\n"
+ " .insn rrf,0xb9390000,2,4,6,0"
: [query] "=R" (*query)
:
: "cc", "0", "1");
@@ -528,10 +525,6 @@ static int __init __kvm_s390_init(void)
if (rc)
goto err_gib;
- gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_pte_notifier(&gmap_notifier);
- vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
- gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
@@ -551,8 +544,6 @@ err_kvm_uv:
static void __kvm_s390_exit(void)
{
- gmap_unregister_pte_notifier(&gmap_notifier);
- gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
@@ -563,12 +554,43 @@ static void __kvm_s390_exit(void)
debug_unregister(kvm_s390_dbf_uv);
}
+static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op,
+ unsigned long addr, union skey skey)
+{
+ union asce asce = kvm->arch.gmap->asce;
+ gfn_t gfn = gpa_to_gfn(addr);
+ int r;
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ switch (op) {
+ case KVM_S390_KEYOP_SSKE:
+ r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0);
+ if (r >= 0)
+ return skey.skey;
+ break;
+ case KVM_S390_KEYOP_ISKE:
+ r = dat_get_storage_key(asce, gfn, &skey);
+ if (!r)
+ return skey.skey;
+ break;
+ case KVM_S390_KEYOP_RRBE:
+ r = dat_reset_reference_bit(asce, gfn);
+ if (r > 0)
+ return r << 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return r;
+}
+
/* Section: device related */
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
if (ioctl == KVM_S390_ENABLE_SIE)
- return s390_enable_sie();
+ return 0;
return -EINVAL;
}
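A hedged sketch of how userspace could drive the new KVM_S390_KEYOP vm ioctl that reaches kvm_s390_keyop() above; the UAPI struct layout and operation constants are assumed from the handler code later in this patch, so treat this as an illustration rather than the definitive interface:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* read the storage key of one guest absolute page (ISKE) */
	static int guest_iske(int vm_fd, uint64_t guest_addr, uint8_t *key)
	{
		struct kvm_s390_keyop kop = {
			.guest_addr = guest_addr,
			.operation  = KVM_S390_KEYOP_ISKE,
		};

		if (ioctl(vm_fd, KVM_S390_KEYOP, &kop) < 0)
			return -1;
		*key = kop.key;		/* the kernel copies the resulting key back */
		return 0;
	}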
@@ -579,7 +601,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
switch (ext) {
case KVM_CAP_S390_PSW:
case KVM_CAP_S390_GMAP:
- case KVM_CAP_SYNC_MMU:
#ifdef CONFIG_KVM_S390_UCONTROL
case KVM_CAP_S390_UCONTROL:
#endif
@@ -606,6 +627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_S390_USER_OPEREXEC:
+ case KVM_CAP_S390_KEYOP:
+ case KVM_CAP_S390_VSIE_ESAMODE:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -631,11 +655,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_S390_BSCA_CPU_SLOTS;
+ /*
+ * Return the same value for KVM_CAP_MAX_VCPUS and
+ * KVM_CAP_MAX_VCPU_ID to conform with the KVM API.
+ */
+ r = KVM_S390_ESCA_CPU_SLOTS;
if (!kvm_s390_use_sca_entries())
r = KVM_MAX_VCPUS;
- else if (sclp.has_esca && sclp.has_64bscao)
- r = KVM_S390_ESCA_CPU_SLOTS;
if (ext == KVM_CAP_NR_VCPUS)
r = min_t(unsigned int, num_online_cpus(), r);
break;
@@ -694,32 +720,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- int i;
- gfn_t cur_gfn, last_gfn;
- unsigned long gaddr, vmaddr;
- struct gmap *gmap = kvm->arch.gmap;
- DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
-
- /* Loop over all guest segments */
- cur_gfn = memslot->base_gfn;
- last_gfn = memslot->base_gfn + memslot->npages;
- for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
- gaddr = gfn_to_gpa(cur_gfn);
- vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
- if (kvm_is_error_hva(vmaddr))
- continue;
-
- bitmap_zero(bitmap, _PAGE_ENTRIES);
- gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
- for (i = 0; i < _PAGE_ENTRIES; i++) {
- if (test_bit(i, bitmap))
- mark_page_dirty(kvm, cur_gfn + i);
- }
+ gfn_t last_gfn = memslot->base_gfn + memslot->npages;
- if (fatal_signal_pending(current))
- return;
- cond_resched();
- }
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn);
}
/* Section: vm related */
@@ -879,9 +883,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
r = -EINVAL;
else {
r = 0;
- mmap_write_lock(kvm->mm);
- kvm->mm->context.allow_gmap_hpage_1m = 1;
- mmap_write_unlock(kvm->mm);
+ set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags);
/*
* We might have to create fake 4k page
* tables. To avoid that the hardware works on
@@ -919,6 +921,17 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
r ? "(not available)" : "(success)");
break;
+ case KVM_CAP_S390_USER_OPEREXEC:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC");
+ kvm->arch.user_operexec = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
+ case KVM_CAP_S390_VSIE_ESAMODE:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_VSIE_ESAMODE");
+ kvm->arch.allow_vsie_esamode = 1;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -948,7 +961,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att
static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
- unsigned int idx;
+
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
ret = -ENXIO;
@@ -959,8 +972,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
ret = -EBUSY;
- else if (kvm->mm->context.allow_gmap_hpage_1m)
- ret = -EINVAL;
else {
kvm->arch.use_cmma = 1;
/* Not compatible with cmma. */
@@ -969,7 +980,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
}
mutex_unlock(&kvm->lock);
break;
- case KVM_S390_VM_MEM_CLR_CMMA:
+ case KVM_S390_VM_MEM_CLR_CMMA: {
+ gfn_t start_gfn = 0;
+
ret = -ENXIO;
if (!sclp.has_cmma)
break;
@@ -978,13 +991,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
break;
VM_EVENT(kvm, 3, "%s", "RESET: CMMA states");
- mutex_lock(&kvm->lock);
- idx = srcu_read_lock(&kvm->srcu);
- s390_reset_cmma(kvm->arch.gmap->mm);
- srcu_read_unlock(&kvm->srcu, idx);
- mutex_unlock(&kvm->lock);
+ do {
+ start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn);
+ cond_resched();
+ } while (start_gfn);
ret = 0;
break;
+ }
case KVM_S390_VM_MEM_LIMIT_SIZE: {
unsigned long new_limit;
@@ -1001,29 +1014,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_create takes last usable address */
- if (new_limit != KVM_S390_NO_MEM_LIMIT)
- new_limit -= 1;
-
ret = -EBUSY;
- mutex_lock(&kvm->lock);
- if (!kvm->created_vcpus) {
- /* gmap_create will round the limit up */
- struct gmap *new = gmap_create(current->mm, new_limit);
-
- if (!new) {
- ret = -ENOMEM;
- } else {
- gmap_remove(kvm->arch.gmap);
- new->private = kvm;
- kvm->arch.gmap = new;
- ret = 0;
- }
- }
- mutex_unlock(&kvm->lock);
+ if (!kvm->created_vcpus)
+ ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit));
VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
- VM_EVENT(kvm, 3, "New guest asce: 0x%pK",
- (void *) kvm->arch.gmap->asce);
+ VM_EVENT(kvm, 3, "New guest asce: 0x%p",
+ (void *)kvm->arch.gmap->asce.val);
break;
}
default:
@@ -1188,19 +1184,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
kvm->arch.migration_mode = 1;
return 0;
}
- /* mark all the pages in active slots as dirty */
kvm_for_each_memslot(ms, bkt, slots) {
if (!ms->dirty_bitmap)
return -EINVAL;
- /*
- * The second half of the bitmap is only used on x86,
- * and would be wasted otherwise, so we put it to good
- * use here to keep track of the state of the storage
- * attributes.
- */
- memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
ram_pages += ms->npages;
}
+ /* mark all the pages as dirty */
+ gmap_set_cmma_all_dirty(kvm->arch.gmap);
atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
kvm->arch.migration_mode = 1;
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
@@ -1433,7 +1423,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -EBUSY;
goto out;
}
- proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
+ proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1633,7 +1623,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_processor *proc;
int ret = 0;
- proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT);
+ proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT);
if (!proc) {
ret = -ENOMEM;
goto out;
@@ -1661,7 +1651,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_vm_cpu_machine *mach;
int ret = 0;
- mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT);
+ mach = kzalloc_obj(*mach, GFP_KERNEL_ACCOUNT);
if (!mach) {
ret = -ENOMEM;
goto out;
@@ -1930,22 +1920,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
* Updates the Multiprocessor Topology-Change-Report bit to signal
* the guest with a topology change.
* This is only relevant if the topology facility is present.
- *
- * The SCA version, bsca or esca, doesn't matter as offset is the same.
*/
static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
{
union sca_utility new, old;
- struct bsca_block *sca;
+ struct esca_block *sca;
- read_lock(&kvm->arch.sca_lock);
sca = kvm->arch.sca;
old = READ_ONCE(sca->utility);
do {
new = old;
new.mtcr = val;
} while (!try_cmpxchg(&sca->utility.val, &old.val, new.val));
- read_unlock(&kvm->arch.sca_lock);
}
static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
@@ -1966,9 +1952,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
if (!test_kvm_facility(kvm, 11))
return -ENXIO;
- read_lock(&kvm->arch.sca_lock);
- topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
- read_unlock(&kvm->arch.sca_lock);
+ topo = kvm->arch.sca->utility.mtcr;
return put_user(topo, (u8 __user *)attr->addr);
}
@@ -2112,40 +2096,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
- uint8_t *keys;
- uint64_t hva;
- int srcu_idx, i, r = 0;
+ union skey *keys;
+ int i, r = 0;
if (args->flags != 0)
return -EINVAL;
/* Is this guest using storage keys? */
- if (!mm_uses_skeys(current->mm))
+ if (!uses_skeys(kvm->arch.gmap))
return KVM_S390_GET_SKEYS_NONE;
/* Enforce sane limit on memory allocation */
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+ keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
- mmap_read_lock(current->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < args->count; i++) {
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
- break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ for (i = 0; i < args->count; i++) {
+ r = dat_get_storage_key(kvm->arch.gmap->asce,
+ args->start_gfn + i, keys + i);
+ if (r)
+ break;
}
-
- r = get_guest_storage_key(current->mm, hva, &keys[i]);
- if (r)
- break;
}
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(current->mm);
if (!r) {
r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
@@ -2160,10 +2136,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
- uint8_t *keys;
- uint64_t hva;
- int srcu_idx, i, r = 0;
- bool unlocked;
+ struct kvm_s390_mmu_cache *mc;
+ union skey *keys;
+ int i, r = 0;
if (args->flags != 0)
return -EINVAL;
@@ -2172,7 +2147,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+ keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -2184,159 +2159,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
}
/* Enable storage key handling for the guest */
- r = s390_enable_skey();
+ r = gmap_enable_skeys(kvm->arch.gmap);
if (r)
goto out;
- i = 0;
- mmap_read_lock(current->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- while (i < args->count) {
- unlocked = false;
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
- break;
- }
-
+ r = -EINVAL;
+ for (i = 0; i < args->count; i++) {
/* Lowest order bit is reserved */
- if (keys[i] & 0x01) {
- r = -EINVAL;
- break;
- }
-
- r = set_guest_storage_key(current->mm, hva, keys[i], 0);
- if (r) {
- r = fixup_user_fault(current->mm, hva,
- FAULT_FLAG_WRITE, &unlocked);
- if (r)
- break;
- }
- if (!r)
- i++;
- }
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(current->mm);
-out:
- kvfree(keys);
- return r;
-}
-
-/*
- * Base address and length must be sent at the start of each block, therefore
- * it's cheaper to send some clean data, as long as it's less than the size of
- * two longs.
- */
-#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
-/* for consistency */
-#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
-
-static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
- u8 *res, unsigned long bufsize)
-{
- unsigned long pgstev, hva, cur_gfn = args->start_gfn;
-
- args->count = 0;
- while (args->count < bufsize) {
- hva = gfn_to_hva(kvm, cur_gfn);
- /*
- * We return an error if the first value was invalid, but we
- * return successfully if at least one value was copied.
- */
- if (kvm_is_error_hva(hva))
- return args->count ? 0 : -EFAULT;
- if (get_pgste(kvm->mm, hva, &pgstev) < 0)
- pgstev = 0;
- res[args->count++] = (pgstev >> 24) & 0x43;
- cur_gfn++;
- }
-
- return 0;
-}
-
-static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots,
- gfn_t gfn)
-{
- return ____gfn_to_memslot(slots, gfn, true);
-}
-
-static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
- unsigned long cur_gfn)
-{
- struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
- unsigned long ofs = cur_gfn - ms->base_gfn;
- struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
-
- if (ms->base_gfn + ms->npages <= cur_gfn) {
- mnode = rb_next(mnode);
- /* If we are above the highest slot, wrap around */
- if (!mnode)
- mnode = rb_first(&slots->gfn_tree);
-
- ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
- ofs = 0;
+ if (keys[i].zero)
+ goto out;
}
- if (cur_gfn < ms->base_gfn)
- ofs = 0;
-
- ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
- while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
- ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
- ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc) {
+ r = -ENOMEM;
+ goto out;
}
- return ms->base_gfn + ofs;
-}
-
-static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
- u8 *res, unsigned long bufsize)
-{
- unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
- struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *ms;
- if (unlikely(kvm_memslots_empty(slots)))
- return 0;
-
- cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
- ms = gfn_to_memslot(kvm, cur_gfn);
- args->count = 0;
- args->start_gfn = cur_gfn;
- if (!ms)
- return 0;
- next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- mem_end = kvm_s390_get_gfn_end(slots);
-
- while (args->count < bufsize) {
- hva = gfn_to_hva(kvm, cur_gfn);
- if (kvm_is_error_hva(hva))
- return 0;
- /* Decrement only if we actually flipped the bit to 0 */
- if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
- atomic64_dec(&kvm->arch.cmma_dirty_pages);
- if (get_pgste(kvm->mm, hva, &pgstev) < 0)
- pgstev = 0;
- /* Save the value */
- res[args->count++] = (pgstev >> 24) & 0x43;
- /* If the next bit is too far away, stop. */
- if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
- return 0;
- /* If we reached the previous "next", find the next one */
- if (cur_gfn == next_gfn)
- next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- /* Reached the end of memory or of the buffer, stop */
- if ((next_gfn >= mem_end) ||
- (next_gfn - args->start_gfn >= bufsize))
- return 0;
- cur_gfn++;
- /* Reached the end of the current memslot, take the next one. */
- if (cur_gfn - ms->base_gfn >= ms->npages) {
- ms = gfn_to_memslot(kvm, cur_gfn);
- if (!ms)
- return 0;
+ r = 0;
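+	/* the cache is topped up outside mmu_lock; retry if it runs dry under the lock */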
+ do {
+ r = kvm_s390_mmu_cache_topup(mc);
+ if (r == -ENOMEM)
+ break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ for (i = 0 ; i < args->count; i++) {
+ r = dat_set_storage_key(mc, kvm->arch.gmap->asce,
+ args->start_gfn + i, keys[i], 0);
+ if (r)
+ break;
+ }
}
- }
- return 0;
+ } while (r == -ENOMEM);
+ kvm_s390_free_mmu_cache(mc);
+out:
+ kvfree(keys);
+ return r;
}
/*
@@ -2350,8 +2207,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
static int kvm_s390_get_cmma_bits(struct kvm *kvm,
struct kvm_s390_cmma_log *args)
{
- unsigned long bufsize;
- int srcu_idx, peek, ret;
+ int peek, ret;
u8 *values;
if (!kvm->arch.use_cmma)
@@ -2364,8 +2220,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
if (!peek && !kvm->arch.migration_mode)
return -EINVAL;
/* CMMA is disabled or was not used, or the buffer has length zero */
- bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
- if (!bufsize || !kvm->mm->context.uses_cmm) {
+ args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX);
+ if (!args->count || !uses_cmm(kvm->arch.gmap)) {
memset(args, 0, sizeof(*args));
return 0;
}
@@ -2375,18 +2231,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
return 0;
}
- values = vmalloc(bufsize);
+ values = vmalloc(args->count);
if (!values)
return -ENOMEM;
- mmap_read_lock(kvm->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- if (peek)
- ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
- else
- ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(kvm->mm);
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (peek)
+ ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count,
+ values);
+ else
+ ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count,
+ values, &kvm->arch.cmma_dirty_pages);
+ }
if (kvm->arch.migration_mode)
args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
@@ -2408,11 +2264,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
static int kvm_s390_set_cmma_bits(struct kvm *kvm,
const struct kvm_s390_cmma_log *args)
{
- unsigned long hva, mask, pgstev, i;
- uint8_t *bits;
- int srcu_idx, r = 0;
-
- mask = args->mask;
+ struct kvm_s390_mmu_cache *mc;
+ u8 *bits = NULL;
+ int r = 0;
if (!kvm->arch.use_cmma)
return -ENXIO;
@@ -2426,9 +2280,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
if (args->count == 0)
return 0;
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc)
+ return -ENOMEM;
bits = vmalloc(array_size(sizeof(*bits), args->count));
if (!bits)
- return -ENOMEM;
+	{
+		r = -ENOMEM;
+		goto out;
+	}
r = copy_from_user(bits, (void __user *)args->values, args->count);
if (r) {
@@ -2436,29 +2293,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
goto out;
}
- mmap_read_lock(kvm->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < args->count; i++) {
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
+ do {
+ r = kvm_s390_mmu_cache_topup(mc);
+ if (r)
break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn,
+ args->count, args->mask, bits);
}
+ } while (r == -ENOMEM);
- pgstev = bits[i];
- pgstev = pgstev << 24;
- mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
- set_pgste_bits(kvm->mm, hva, mask, pgstev);
- }
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(kvm->mm);
-
- if (!kvm->mm->context.uses_cmm) {
- mmap_write_lock(kvm->mm);
- kvm->mm->context.uses_cmm = 1;
- mmap_write_unlock(kvm->mm);
- }
+ set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags);
out:
+ kvm_s390_free_mmu_cache(mc);
vfree(bits);
return r;
}
@@ -2666,15 +2513,16 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (kvm_s390_pv_is_protected(kvm))
break;
+ mmap_write_lock(kvm->mm);
/*
- * FMT 4 SIE needs esca. As we never switch back to bsca from
- * esca, we need no cleanup in the error cases below
+ * Disable creation of new THPs. Existing THPs can stay, they
+ * will be split when any part of them gets imported.
*/
- r = sca_switch_to_extended(kvm);
- if (r)
- break;
-
- r = s390_disable_cow_sharing();
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm);
+ mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm);
+ set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags);
+ r = gmap_helper_disable_cow_sharing();
+ mmap_write_unlock(kvm->mm);
if (r)
break;
@@ -2746,9 +2594,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (copy_from_user(&parms, argp, sizeof(parms)))
break;
- /* Currently restricted to 8KB */
+ /* Currently restricted to 1MiB */
r = -EINVAL;
- if (parms.length > PAGE_SIZE * 2)
+ if (parms.length > SZ_1M)
break;
r = -ENOMEM;
@@ -2902,9 +2750,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla
static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ void *tmpbuf __free(kvfree) = NULL;
enum gacc_mode acc_mode;
- void *tmpbuf = NULL;
- int r, srcu_idx;
+ int r;
r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION |
KVM_S390_MEMOP_F_CHECK_ONLY);
@@ -2917,52 +2765,32 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
return -ENOMEM;
}
- srcu_idx = srcu_read_lock(&kvm->srcu);
+ acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
- if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
- r = PGM_ADDRESSING;
- goto out_unlock;
- }
+ scoped_guard(srcu, &kvm->srcu) {
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)
+ return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
- acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
- goto out_unlock;
- }
- if (acc_mode == GACC_FETCH) {
+ if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size))
+ return -EFAULT;
r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_FETCH, mop->key);
+ mop->size, acc_mode, mop->key);
if (r)
- goto out_unlock;
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- } else {
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- goto out_unlock;
- }
- r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_STORE, mop->key);
+ return r;
+ if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size))
+ return -EFAULT;
}
-
-out_unlock:
- srcu_read_unlock(&kvm->srcu, srcu_idx);
-
- vfree(tmpbuf);
- return r;
+ return 0;
}
static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
void __user *old_addr = (void __user *)mop->old_addr;
- union {
- __uint128_t quad;
- char raw[sizeof(__uint128_t)];
- } old = { .quad = 0}, new = { .quad = 0 };
- unsigned int off_in_quad = sizeof(new) - mop->size;
- int r, srcu_idx;
- bool success;
+ union kvm_s390_quad old = { .sixteen = 0 };
+ union kvm_s390_quad new = { .sixteen = 0 };
+ bool success = false;
+ int r;
r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION);
if (r)
@@ -2974,25 +2802,18 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m
*/
if (mop->size > sizeof(new))
return -EINVAL;
- if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+ if (copy_from_user(&new, uaddr, mop->size))
return -EFAULT;
- if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size))
+ if (copy_from_user(&old, old_addr, mop->size))
return -EFAULT;
- srcu_idx = srcu_read_lock(&kvm->srcu);
+ scoped_guard(srcu, &kvm->srcu) {
+ r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new,
+ mop->key, &success);
- if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
- r = PGM_ADDRESSING;
- goto out_unlock;
+ if (!success && copy_to_user(old_addr, &old, mop->size))
+ return -EFAULT;
}
-
- r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad,
- new.quad, mop->key, &success);
- if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size))
- r = -EFAULT;
-
-out_unlock:
- srcu_read_unlock(&kvm->srcu, srcu_idx);
return r;
}
@@ -3147,6 +2968,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
r = -EFAULT;
break;
}
+ case KVM_S390_KEYOP: {
+ struct kvm_s390_mmu_cache *mc;
+ struct kvm_s390_keyop kop;
+ union skey skey;
+
+ if (copy_from_user(&kop, argp, sizeof(kop))) {
+ r = -EFAULT;
+ break;
+ }
+ skey.skey = kop.key;
+
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc)
+ return -ENOMEM;
+
+ r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey);
+ kvm_s390_free_mmu_cache(mc);
+ if (r < 0)
+ break;
+
+ kop.key = r;
+ r = 0;
+ if (copy_to_user(argp, &kop, sizeof(kop)))
+ r = -EFAULT;
+ break;
+ }
case KVM_S390_ZPCI_OP: {
struct kvm_s390_zpci_op args;
@@ -3314,10 +3161,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm)
static void sca_dispose(struct kvm *kvm)
{
- if (kvm->arch.use_esca)
- free_pages_exact(kvm->arch.sca, sizeof(struct esca_block));
- else
- free_page((unsigned long)(kvm->arch.sca));
+ free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca));
kvm->arch.sca = NULL;
}
@@ -3331,10 +3175,11 @@ void kvm_arch_free_vm(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
- int i, rc;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
char debug_name[16];
- static unsigned long sca_offset;
+ int i, rc;
+
+ mutex_init(&kvm->arch.pv.import_lock);
rc = -EINVAL;
#ifdef CONFIG_KVM_S390_UCONTROL
@@ -3346,29 +3191,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (type)
goto out_err;
#endif
-
- rc = s390_enable_sie();
- if (rc)
- goto out_err;
-
rc = -ENOMEM;
if (!sclp.has_64bscao)
alloc_flags |= GFP_DMA;
- rwlock_init(&kvm->arch.sca_lock);
- /* start with basic SCA */
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
- if (!kvm->arch.sca)
- goto out_err;
mutex_lock(&kvm_lock);
- sca_offset += 16;
- if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
- sca_offset = 0;
- kvm->arch.sca = (struct bsca_block *)
- ((char *) kvm->arch.sca + sca_offset);
+
+ kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags);
mutex_unlock(&kvm_lock);
+ if (!kvm->arch.sca)
+ goto out_err;
- sprintf(debug_name, "kvm-%u", current->pid);
+ snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid);
kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long));
if (!kvm->arch.dbf)
@@ -3430,6 +3264,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
VM_EVENT(kvm, 3, "vm created with type %lu", type);
+ kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1;
+ kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit));
+ if (!kvm->arch.gmap)
+ goto out_err;
+ clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags);
+
if (type & KVM_VM_S390_UCONTROL) {
struct kvm_userspace_memory_region2 fake_memslot = {
.slot = KVM_S390_UCONTROL_MEMSLOT,
@@ -3439,23 +3279,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
.flags = 0,
};
- kvm->arch.gmap = NULL;
- kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
/* one flat fake memslot covering the whole address-space */
mutex_lock(&kvm->slots_lock);
KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm);
mutex_unlock(&kvm->slots_lock);
+ set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags);
} else {
- if (sclp.hamax == U64_MAX)
- kvm->arch.mem_limit = TASK_SIZE_MAX;
- else
- kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX,
- sclp.hamax + 1);
- kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
- if (!kvm->arch.gmap)
- goto out_err;
- kvm->arch.gmap->private = kvm;
- kvm->arch.gmap->pfault_enabled = 0;
+ struct crst_table *table = dereference_asce(kvm->arch.gmap->asce);
+
+ crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val);
}
kvm->arch.use_pfmfi = sclp.has_pfmfi;
@@ -3466,7 +3298,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_gisa_init(kvm);
INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
kvm->arch.pv.set_aside = NULL;
- KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
+ KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid);
return 0;
out_err:
@@ -3489,8 +3321,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
kvm_s390_update_topology_change_report(vcpu->kvm, 1);
- if (kvm_is_ucontrol(vcpu->kvm))
- gmap_remove(vcpu->arch.gmap);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock)
+ gmap_remove_child(vcpu->arch.gmap);
+ vcpu->arch.gmap = gmap_put(vcpu->arch.gmap);
+ }
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -3498,6 +3333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (kvm_s390_pv_cpu_get_handle(vcpu))
kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
free_page((unsigned long)(vcpu->arch.sie_block));
+ kvm_s390_free_mmu_cache(vcpu->arch.mc);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -3524,154 +3360,48 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
- if (!kvm_is_ucontrol(kvm))
- gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
kvm_s390_vsie_destroy(kvm);
- KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
+ kvm->arch.gmap = gmap_put(kvm->arch.gmap);
+ KVM_EVENT(3, "vm 0x%p destroyed", kvm);
}
/* Section: vcpu related */
-static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.gmap = gmap_create(current->mm, -1UL);
- if (!vcpu->arch.gmap)
- return -ENOMEM;
- vcpu->arch.gmap->private = vcpu->kvm;
-
- return 0;
-}
-
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+
if (!kvm_s390_use_sca_entries())
return;
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
-
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = 0;
}
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
- if (!kvm_s390_use_sca_entries()) {
- phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
-
- /* we still need the basic sca for the ipte control */
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- return;
- }
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
-}
-
-/* Basic SCA to Extended SCA data copy routines */
-static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s)
-{
- d->sda = s->sda;
- d->sigp_ctrl.c = s->sigp_ctrl.c;
- d->sigp_ctrl.scn = s->sigp_ctrl.scn;
-}
-
-static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s)
-{
- int i;
-
- d->ipte_control = s->ipte_control;
- d->mcn[0] = s->mcn;
- for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++)
- sca_copy_entry(&d->cpu[i], &s->cpu[i]);
-}
-
-static int sca_switch_to_extended(struct kvm *kvm)
-{
- struct bsca_block *old_sca = kvm->arch.sca;
- struct esca_block *new_sca;
- struct kvm_vcpu *vcpu;
- unsigned long vcpu_idx;
- u32 scaol, scaoh;
- phys_addr_t new_sca_phys;
-
- if (kvm->arch.use_esca)
- return 0;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!new_sca)
- return -ENOMEM;
-
- new_sca_phys = virt_to_phys(new_sca);
- scaoh = new_sca_phys >> 32;
- scaol = new_sca_phys & ESCA_SCAOL_MASK;
-
- kvm_s390_vcpu_block_all(kvm);
- write_lock(&kvm->arch.sca_lock);
-
- sca_copy_b_to_e(new_sca, old_sca);
-
- kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) {
- vcpu->arch.sie_block->scaoh = scaoh;
- vcpu->arch.sie_block->scaol = scaol;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- }
- kvm->arch.sca = new_sca;
- kvm->arch.use_esca = 1;
+ /* we still need the sca header for the ipte control */
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
+ vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- write_unlock(&kvm->arch.sca_lock);
- kvm_s390_vcpu_unblock_all(kvm);
-
- free_page((unsigned long)old_sca);
+ if (!kvm_s390_use_sca_entries())
+ return;
- VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)",
- old_sca, kvm->arch.sca);
- return 0;
+ set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
}
static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
{
- int rc;
-
- if (!kvm_s390_use_sca_entries()) {
- if (id < KVM_MAX_VCPUS)
- return true;
- return false;
- }
- if (id < KVM_S390_BSCA_CPU_SLOTS)
- return true;
- if (!sclp.has_esca || !sclp.has_64bscao)
- return false;
-
- rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
+ if (!kvm_s390_use_sca_entries())
+ return id < KVM_MAX_VCPUS;
- return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
+ return id < KVM_S390_ESCA_CPU_SLOTS;
}
/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
@@ -3917,7 +3647,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->eca |= ECA_IB;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= ECA_SII;
- if (sclp.has_sigpif)
+ if (kvm_s390_use_sca_entries())
vcpu->arch.sie_block->eca |= ECA_SIGPI;
if (test_kvm_facility(vcpu->kvm, 129)) {
vcpu->arch.sie_block->eca |= ECA_VX;
@@ -3978,9 +3708,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int rc;
BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
+ vcpu->arch.mc = kvm_s390_new_mmu_cache();
+ if (!vcpu->arch.mc)
+ return -ENOMEM;
sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!sie_page)
+ if (!sie_page) {
+ kvm_s390_free_mmu_cache(vcpu->arch.mc);
+ vcpu->arch.mc = NULL;
return -ENOMEM;
+ }
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
@@ -4022,12 +3758,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
if (kvm_is_ucontrol(vcpu->kvm)) {
- rc = __kvm_ucontrol_vcpu_init(vcpu);
- if (rc)
+ rc = -ENOMEM;
+ vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL);
+ if (!vcpu->arch.gmap)
goto out_free_sie_block;
}
- VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK",
+ VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p",
vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
@@ -4039,8 +3776,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
out_ucontrol_uninit:
- if (kvm_is_ucontrol(vcpu->kvm))
- gmap_remove(vcpu->arch.gmap);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ gmap_remove_child(vcpu->arch.gmap);
+ vcpu->arch.gmap = gmap_put(vcpu->arch.gmap);
+ }
out_free_sie_block:
free_page((unsigned long)(vcpu->arch.sie_block));
return rc;
@@ -4104,32 +3843,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end)
-{
- struct kvm *kvm = gmap->private;
- struct kvm_vcpu *vcpu;
- unsigned long prefix;
- unsigned long i;
-
- trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));
-
- if (gmap_is_shadow(gmap))
- return;
- if (start >= 1UL << 31)
- /* We are only interested in prefix pages */
- return;
- kvm_for_each_vcpu(i, vcpu, kvm) {
- /* match against both prefix pages */
- prefix = kvm_s390_get_prefix(vcpu);
- if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
- start, end);
- kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
- }
- }
-}
-
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
/* do not poll with more than halt_poll_max_steal percent of steal time */
@@ -4364,8 +4077,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- int ret = 0;
-
vcpu_load(vcpu);
vcpu->run->s.regs.fpc = fpu->fpc;
@@ -4376,7 +4087,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
vcpu_put(vcpu);
- return ret;
+ return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -4513,72 +4224,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
}
-static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags)
-{
- struct kvm *kvm = gmap->private;
- gfn_t gfn = gpa_to_gfn(gaddr);
- bool unlocked;
- hva_t vmaddr;
- gpa_t tmp;
- int rc;
-
- if (kvm_is_ucontrol(kvm)) {
- tmp = __gmap_translate(gmap, gaddr);
- gfn = gpa_to_gfn(tmp);
- }
-
- vmaddr = gfn_to_hva(kvm, gfn);
- rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
- if (!rc)
- rc = __gmap_link(gmap, gaddr, vmaddr);
- return rc;
-}
-
-/**
- * __kvm_s390_mprotect_many() - Apply specified protection to guest pages
- * @gmap: the gmap of the guest
- * @gpa: the starting guest address
- * @npages: how many pages to protect
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: pgste notification bits to set
- *
- * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one()
- *
- * Context: kvm->srcu and gmap->mm need to be held in read mode
- */
-int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
- unsigned long bits)
+static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr)
{
- unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- gpa_t end = gpa + npages * PAGE_SIZE;
int rc;
- for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) {
- rc = gmap_protect_one(gmap, gpa, prot, bits);
- if (rc == -EAGAIN) {
- __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag);
- rc = gmap_protect_one(gmap, gpa, prot, bits);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr);
+ if (rc == -EREMOTE) {
+ vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
+ vcpu->run->s390_ucontrol.trans_exc_code = *gaddr;
+ vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
}
- if (rc < 0)
- return rc;
+ return rc;
}
-
return 0;
}
-static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu)
+static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu)
{
gpa_t gaddr = kvm_s390_get_prefix(vcpu);
- int idx, rc;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- mmap_read_lock(vcpu->arch.gmap->mm);
+ gfn_t gfn;
+ int rc;
- rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT);
+ if (vcpu_ucontrol_translate(vcpu, &gaddr))
+ return -EREMOTE;
+ gfn = gpa_to_gfn(gaddr);
- mmap_read_unlock(vcpu->arch.gmap->mm);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true);
+ if (rc)
+ return rc;
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true);
+ if (rc)
+ return rc;
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn);
return rc;
}
@@ -4598,7 +4278,7 @@ retry:
if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
int rc;
- rc = kvm_s390_mprotect_notify_prefix(vcpu);
+ rc = kvm_s390_fixup_prefix(vcpu);
if (rc) {
kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
return rc;
@@ -4647,8 +4327,7 @@ retry:
* Re-enable CMM virtualization if CMMA is available and
* CMM has been used.
*/
- if ((vcpu->kvm->arch.use_cmma) &&
- (vcpu->kvm->mm->context.uses_cmm))
+ if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap))
vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
goto retry;
}
@@ -4744,7 +4423,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return true;
}
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
{
hva_t hva;
struct kvm_arch_async_pf arch;
@@ -4760,7 +4439,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
return false;
if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
return false;
- if (!vcpu->arch.gmap->pfault_enabled)
+ if (!pfault_enabled(vcpu->arch.gmap))
return false;
hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
@@ -4784,9 +4463,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14];
vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15];
- if (need_resched())
- schedule();
-
if (!kvm_is_ucontrol(vcpu->kvm)) {
rc = kvm_s390_deliver_pending_interrupts(vcpu);
if (rc || guestdbg_exit_pending(vcpu))
@@ -4856,109 +4532,36 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
current->thread.gmap_int_code, current->thread.gmap_teid.val);
}
-/*
- * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu
- * @vcpu: the vCPU whose gmap is to be fixed up
- * @gfn: the guest frame number used for memslots (including fake memslots)
- * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
- * @flags: FOLL_* flags
- *
- * Return: 0 on success, < 0 in case of error.
- * Context: The mm lock must not be held before calling. May sleep.
- */
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags)
-{
- struct kvm_memory_slot *slot;
- unsigned int fault_flags;
- bool writable, unlocked;
- unsigned long vmaddr;
- struct page *page;
- kvm_pfn_t pfn;
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr)
+{
+ struct guest_fault f = {
+ .write_attempt = wr,
+ .attempt_pfault = pfault_enabled(vcpu->arch.gmap),
+ };
int rc;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return vcpu_post_run_addressing_exception(vcpu);
-
- fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
- if (vcpu->arch.gmap->pfault_enabled)
- flags |= FOLL_NOWAIT;
- vmaddr = __gfn_to_hva_memslot(slot, gfn);
-
-try_again:
- pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page);
+ if (vcpu_ucontrol_translate(vcpu, &gaddr))
+ return -EREMOTE;
+ f.gfn = gpa_to_gfn(gaddr);
- /* Access outside memory, inject addressing exception */
- if (is_noslot_pfn(pfn))
+ rc = kvm_s390_faultin_gfn(vcpu, NULL, &f);
+ if (rc <= 0)
+ return rc;
+ if (rc == PGM_ADDRESSING)
return vcpu_post_run_addressing_exception(vcpu);
- /* Signal pending: try again */
- if (pfn == KVM_PFN_ERR_SIGPENDING)
- return -EAGAIN;
-
- /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */
- if (pfn == KVM_PFN_ERR_NEEDS_IO) {
- trace_kvm_s390_major_guest_pfault(vcpu);
- if (kvm_arch_setup_async_pf(vcpu))
- return 0;
- vcpu->stat.pfault_sync++;
- /* Could not setup async pfault, try again synchronously */
- flags &= ~FOLL_NOWAIT;
- goto try_again;
- }
- /* Any other error */
- if (is_error_pfn(pfn))
- return -EFAULT;
-
- /* Success */
- mmap_read_lock(vcpu->arch.gmap->mm);
- /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */
- rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked);
- if (!rc)
- rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr);
- scoped_guard(spinlock, &vcpu->kvm->mmu_lock) {
- kvm_release_faultin_page(vcpu->kvm, page, false, writable);
- }
- mmap_read_unlock(vcpu->arch.gmap->mm);
- return rc;
-}
-
-static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags)
-{
- unsigned long gaddr_tmp;
- gfn_t gfn;
-
- gfn = gpa_to_gfn(gaddr);
- if (kvm_is_ucontrol(vcpu->kvm)) {
- /*
- * This translates the per-vCPU guest address into a
- * fake guest address, which can then be used with the
- * fake memslots that are identity mapping userspace.
- * This allows ucontrol VMs to use the normal fault
- * resolution path, like normal VMs.
- */
- mmap_read_lock(vcpu->arch.gmap->mm);
- gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr);
- mmap_read_unlock(vcpu->arch.gmap->mm);
- if (gaddr_tmp == -EFAULT) {
- vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
- vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
- vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
- return -EREMOTE;
- }
- gfn = gpa_to_gfn(gaddr_tmp);
- }
- return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags);
+ KVM_BUG_ON(rc, vcpu->kvm);
+ return -EINVAL;
}
static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
{
- unsigned int flags = 0;
+ unsigned int foll = 0;
unsigned long gaddr;
int rc;
gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
if (kvm_s390_cur_gmap_fault_is_write())
- flags = FAULT_FLAG_WRITE;
+ foll = FOLL_WRITE;
switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
case 0:
@@ -4973,7 +4576,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
* previous protected guest. The old pages need to be destroyed
* so the new guest can use them.
*/
- if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) {
+ if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) {
/*
* Either KVM messed up the secure guest mapping or the
* same page is mapped into multiple secure guests.
@@ -4995,12 +4598,12 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
* guest has not been imported yet. Try to import the page into
* the protected guest.
*/
- rc = gmap_convert_to_secure(vcpu->arch.gmap, gaddr);
+ rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr);
if (rc == -EINVAL)
send_sig(SIGSEGV, current, 0);
if (rc != -ENXIO)
break;
- flags = FAULT_FLAG_WRITE;
+ foll = FOLL_WRITE;
fallthrough;
case PGM_PROTECTION:
case PGM_SEGMENT_TRANSLATION:
@@ -5010,7 +4613,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
case PGM_REGION_SECOND_TRANS:
case PGM_REGION_THIRD_TRANS:
kvm_s390_assert_primary_as(vcpu);
- return vcpu_dat_fault_handler(vcpu, gaddr, flags);
+ return vcpu_dat_fault_handler(vcpu, gaddr, foll);
default:
KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
current->thread.gmap_int_code, current->thread.gmap_teid.val);
@@ -5020,7 +4623,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
return 0;
}
-static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
+static int vcpu_post_run(struct kvm_vcpu *vcpu, int sie_return)
{
struct mcck_volatile_info *mcck_info;
struct sie_page *sie_page;
@@ -5036,14 +4639,14 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14;
vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15;
- if (exit_reason == -EINTR) {
- VCPU_EVENT(vcpu, 3, "%s", "machine check");
+ if (sie_return == SIE64_RETURN_MCCK) {
sie_page = container_of(vcpu->arch.sie_block,
struct sie_page, sie_block);
mcck_info = &sie_page->mcck_info;
kvm_s390_reinject_machine_check(vcpu, mcck_info);
return 0;
}
+ WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL);
if (vcpu->arch.sie_block->icptcode > 0) {
rc = kvm_handle_sie_intercept(vcpu);
@@ -5060,10 +4663,29 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
return vcpu_post_run_handle_fault(vcpu);
}
+int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce)
+{
+ int ret;
+
+ guest_state_enter_irqoff();
+
+ /*
+ * The guest_state_{enter,exit}_irqoff() functions inform lockdep and
+ * tracing that entry to the guest will enable host IRQs, and exit from
+ * the guest will disable host IRQs.
+ */
+ ret = sie64a(scb, gprs, gasce);
+
+ guest_state_exit_irqoff();
+
+ return ret;
+}
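+
+/*
+ * A minimal sketch of the expected calling pattern for
+ * kvm_s390_enter_exit_sie(), mirroring __vcpu_run() below; the guest timing
+ * section must run with IRQs disabled:
+ *
+ *	local_irq_disable();
+ *	guest_timing_enter_irqoff();
+ *	sie_return = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
+ *					     vcpu->run->s.regs.gprs,
+ *					     vcpu->arch.gmap->asce.val);
+ *	guest_timing_exit_irqoff();
+ *	local_irq_enable();
+ */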
+
#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
- int rc, exit_reason;
+ int rc, sie_return;
struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block;
/*
@@ -5072,28 +4694,45 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
*/
kvm_vcpu_srcu_read_lock(vcpu);
- do {
+ while (true) {
rc = vcpu_pre_run(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
if (rc || guestdbg_exit_pending(vcpu))
break;
- kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
- * guest_enter and guest_exit should be no uaccess.
+ * guest_timing_enter_irqoff and guest_timing_exit_irqoff
+ * there should be no uaccess.
*/
- local_irq_disable();
- guest_enter_irqoff();
- __disable_cpu_timer_accounting(vcpu);
- local_irq_enable();
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(sie_page->pv_grregs,
vcpu->run->s.regs.gprs,
sizeof(sie_page->pv_grregs));
}
- exit_reason = sie64a(vcpu->arch.sie_block,
- vcpu->run->s.regs.gprs,
- vcpu->arch.gmap->asce);
+
+xfer_to_guest_mode_check:
+ local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ break;
+ goto xfer_to_guest_mode_check;
+ }
+
+ guest_timing_enter_irqoff();
+ __disable_cpu_timer_accounting(vcpu);
+
+ sie_return = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
+ vcpu->run->s.regs.gprs,
+ vcpu->arch.gmap->asce.val);
+
+ __enable_cpu_timer_accounting(vcpu);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(vcpu->run->s.regs.gprs,
sie_page->pv_grregs,
@@ -5109,16 +4748,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
}
}
- local_irq_disable();
- __enable_cpu_timer_accounting(vcpu);
- guest_exit_irqoff();
- local_irq_enable();
kvm_vcpu_srcu_read_lock(vcpu);
- rc = vcpu_post_run(vcpu, exit_reason);
- } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
+ rc = vcpu_post_run(vcpu, sie_return);
+ if (rc || guestdbg_exit_pending(vcpu)) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ break;
+ }
+ }
- kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}
@@ -5334,6 +4972,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (signal_pending(current) && !rc) {
kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->stat.signal_exits++;
rc = -EINTR;
}
@@ -5623,8 +5262,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ void *tmpbuf __free(kvfree) = NULL;
enum gacc_mode acc_mode;
- void *tmpbuf = NULL;
int r;
r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
@@ -5646,32 +5285,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
acc_mode, mop->key);
- goto out_inject;
- }
- if (acc_mode == GACC_FETCH) {
+ } else if (acc_mode == GACC_FETCH) {
r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- if (r)
- goto out_inject;
- if (copy_to_user(uaddr, tmpbuf, mop->size)) {
- r = -EFAULT;
- goto out_free;
- }
+ if (!r && copy_to_user(uaddr, tmpbuf, mop->size))
+ return -EFAULT;
} else {
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- goto out_free;
- }
+ if (copy_from_user(tmpbuf, uaddr, mop->size))
+ return -EFAULT;
r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
}
-out_inject:
if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-out_free:
- vfree(tmpbuf);
return r;
}
@@ -5700,8 +5328,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -5861,44 +5489,58 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#ifdef CONFIG_KVM_S390_UCONTROL
case KVM_S390_UCAS_MAP: {
- struct kvm_s390_ucas_mapping ucasmap;
+ struct kvm_s390_ucas_mapping ucas;
- if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
- r = -EFAULT;
+ r = -EFAULT;
+ if (copy_from_user(&ucas, argp, sizeof(ucas)))
break;
- }
- if (!kvm_is_ucontrol(vcpu->kvm)) {
- r = -EINVAL;
+ r = -EINVAL;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ break;
+ if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
break;
- }
- r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
- ucasmap.vcpu_addr, ucasmap.length);
+ r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr),
+ gpa_to_gfn(ucas.vcpu_addr),
+ ucas.length >> _SEGMENT_SHIFT);
break;
}
case KVM_S390_UCAS_UNMAP: {
- struct kvm_s390_ucas_mapping ucasmap;
+ struct kvm_s390_ucas_mapping ucas;
- if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
- r = -EFAULT;
+ r = -EFAULT;
+ if (copy_from_user(&ucas, argp, sizeof(ucas)))
break;
- }
- if (!kvm_is_ucontrol(vcpu->kvm)) {
- r = -EINVAL;
+ r = -EINVAL;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ break;
+ if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
break;
- }
- r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
- ucasmap.length);
+ gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr),
+ ucas.length >> _SEGMENT_SHIFT);
+ r = 0;
break;
}
#endif
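+ /*
+ * A minimal userspace sketch for KVM_S390_UCAS_MAP (vcpu_fd and host_buf
+ * are assumed; host_buf must be a segment-aligned host address). All of
+ * user_addr, vcpu_addr and length must be segment (1 MB) aligned,
+ * otherwise the ioctl now fails with -EINVAL:
+ *
+ * struct kvm_s390_ucas_mapping ucas = {
+ * .user_addr = host_buf,
+ * .vcpu_addr = 0x100000,
+ * .length = 0x100000,
+ * };
+ * ioctl(vcpu_fd, KVM_S390_UCAS_MAP, &ucas);
+ */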
case KVM_S390_VCPU_FAULT: {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = vcpu_dat_fault_handler(vcpu, arg, 0);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ gpa_t gaddr = arg;
+
+ scoped_guard(srcu, &vcpu->kvm->srcu) {
+ r = vcpu_ucontrol_translate(vcpu, &gaddr);
+ if (r)
+ break;
+
+ r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false);
+ if (r == PGM_ADDRESSING)
+ r = -EFAULT;
+ if (r <= 0)
+ break;
+ r = -EIO;
+ KVM_BUG_ON(r, vcpu->kvm);
+ }
break;
}
case KVM_ENABLE_CAP:
@@ -6012,9 +5654,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- gpa_t size;
-
- if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS)
+ if (kvm_is_ucontrol(kvm) && new && new->id < KVM_USER_MEM_SLOTS)
return -EINVAL;
/* When we are protected, we should not change the memory slots */
@@ -6023,20 +5663,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
/*
- * A few sanity checks. We can have memory slots which have to be
- * located/ended at a segment boundary (1MB). The memory in userland is
- * ok to be fragmented into various different vmas. It is okay to mmap()
- * and munmap() stuff in this slot after doing this call at any time
+ * A few sanity checks. It is fine for the memory in userland to be
+ * fragmented into various different vmas. It is okay to mmap() and
+ * munmap() memory in this slot at any time after this call.
*/
-
- if (new->userspace_addr & 0xffffful)
+ if (new->userspace_addr & ~PAGE_MASK)
return -EINVAL;
-
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
-
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit)
return -EINVAL;
}
@@ -6064,37 +5698,89 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ struct kvm_s390_mmu_cache *mc = NULL;
int rc = 0;
- if (kvm_is_ucontrol(kvm))
+ if (change == KVM_MR_FLAGS_ONLY)
return;
- switch (change) {
- case KVM_MR_DELETE:
- rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
- old->npages * PAGE_SIZE);
- break;
- case KVM_MR_MOVE:
- rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
- old->npages * PAGE_SIZE);
- if (rc)
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scoped_guard(write_lock, &kvm->mmu_lock) {
+ switch (change) {
+ case KVM_MR_DELETE:
+ rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
break;
- fallthrough;
- case KVM_MR_CREATE:
- rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
- new->base_gfn * PAGE_SIZE,
- new->npages * PAGE_SIZE);
- break;
- case KVM_MR_FLAGS_ONLY:
- break;
- default:
- WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+ case KVM_MR_MOVE:
+ rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
+ if (rc)
+ break;
+ fallthrough;
+ case KVM_MR_CREATE:
+ rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages);
+ break;
+ case KVM_MR_FLAGS_ONLY:
+ break;
+ default:
+ WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+ }
}
+out:
if (rc)
pr_warn("failed to commit memory region\n");
+ kvm_s390_free_mmu_cache(mc);
return;
}
+/**
+ * kvm_test_age_gfn() - test young
+ * @kvm: the kvm instance
+ * @range: the range of guest addresses whose young status needs to be tested
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range is young, otherwise false.
+ */
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end);
+}
+
+/**
+ * kvm_age_gfn() - clear young
+ * @kvm: the kvm instance
+ * @range: the range of guest addresses whose young status needs to be cleared
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range was young, otherwise false.
+ */
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ return gmap_age_gfn(kvm->arch.gmap, range->start, range->end);
+}
+
+/**
+ * kvm_unmap_gfn_range() - Unmap a range of guest addresses
+ * @kvm: the kvm instance
+ * @range: the range of guest page frames to invalidate
+ *
+ * This function always returns false because every DAT table modification
+ * has to use the appropriate DAT table manipulation instructions, which will
+ * keep the TLB coherent, hence no additional TLB flush is ever required.
+ *
+ * Context: called by KVM common code with the kvm mmu write lock held
+ * Return: false
+ */
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end);
+}
+
static inline unsigned long nonhyp_mask(int i)
{
unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30;
@@ -6111,11 +5797,6 @@ static int __init kvm_s390_init(void)
return -ENODEV;
}
- if (nested && hpage) {
- pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n");
- return -EINVAL;
- }
-
for (i = 0; i < 16; i++)
kvm_s390_fac_base[i] |=
stfle_fac_list[i] & nonhyp_mask(i);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8d3bbb2dd8d2..dc0573b7aa4b 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -19,9 +19,19 @@
#include <asm/facility.h>
#include <asm/processor.h>
#include <asm/sclp.h>
+#include "dat.h"
+#include "gmap.h"
#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+union kvm_s390_quad {
+ __uint128_t sixteen;
+ unsigned long eight;
+ unsigned int four;
+ unsigned short two;
+ unsigned char one;
+};
+
static inline void kvm_s390_fpu_store(struct kvm_run *run)
{
fpu_stfpc(&run->s.regs.fpc);
@@ -106,15 +116,15 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
static inline int kvm_is_ucontrol(struct kvm *kvm)
{
#ifdef CONFIG_KVM_S390_UCONTROL
- if (kvm->arch.gmap)
- return 0;
- return 1;
+ return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags);
#else
return 0;
#endif
}
-#define GUEST_PREFIX_SHIFT 13
+#define GUEST_PREFIX_SHIFT 12
+#define GUEST_PREFIX_MASK_ZARCH 0x7fffe
+#define GUEST_PREFIX_MASK_ESA 0x7ffff
static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu)
{
return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT;
@@ -125,6 +135,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id,
prefix);
vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT;
+ vcpu->arch.sie_block->prefix &= GUEST_PREFIX_MASK_ZARCH;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
}
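+
+/*
+ * A small worked example with the 4K GUEST_PREFIX_SHIFT: kvm_s390_set_prefix()
+ * with prefix 0x12340000 stores 0x12340000 >> 12 = 0x12340 in
+ * sie_block->prefix (GUEST_PREFIX_MASK_ZARCH leaves this value unchanged),
+ * and kvm_s390_get_prefix() then yields 0x12340 << 12 = 0x12340000 again.
+ */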
@@ -308,6 +319,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
u16 *rc, u16 *rrc);
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb);
static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
{
@@ -319,6 +333,41 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
return vcpu->arch.pv.handle;
}
+/**
+ * __kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @page: the page to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: must be called holding the mm lock for gmap->mm
+ */
+static inline int __kvm_s390_pv_destroy_page(struct page *page)
+{
+ struct folio *folio = page_folio(page);
+ int rc;
+
+ /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */
+ if (folio_test_large(folio))
+ return -EFAULT;
+
+ rc = uv_destroy_folio(folio);
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_from_secure_folio(folio);
+
+ return rc;
+}
+
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
@@ -394,8 +443,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu);
/* implemented in vsie.c */
int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end);
void kvm_s390_vsie_init(struct kvm *kvm);
void kvm_s390_vsie_destroy(struct kvm *kvm);
@@ -419,14 +467,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags);
int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
unsigned long bits);
-static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags)
-{
- return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags);
-}
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
@@ -528,13 +572,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
-/* support for Basic/Extended SCA handling */
-static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
-{
- struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */
-
- return &sca->ipte_control;
-}
static inline int kvm_s390_use_sca_entries(void)
{
/*
@@ -542,7 +579,7 @@ static inline int kvm_s390_use_sca_entries(void)
* might use the entries. By not setting the entries and keeping them
* invalid, hardware will not access them but intercept.
*/
- return sclp.has_sigpif;
+ return sclp.has_sigpif && sclp.has_esca;
}
void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
struct mcck_volatile_info *mcck_info);
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index 8c40154ff50f..86d93e8dddae 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -54,7 +54,7 @@ static int zpci_setup_aipb(u8 nisc)
struct page *page;
int size, rc;
- zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL);
+ zpci_aipb = kzalloc_obj(union zpci_sic_iib);
if (!zpci_aipb)
return -ENOMEM;
@@ -126,8 +126,7 @@ int kvm_s390_pci_aen_init(u8 nisc)
return -EPERM;
mutex_lock(&aift->aift_lock);
- aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *),
- GFP_KERNEL);
+ aift->kzdev = kzalloc_objs(struct kvm_zdev *, ZPCI_NR_DEVICES);
if (!aift->kzdev) {
rc = -ENOMEM;
goto unlock;
@@ -404,7 +403,7 @@ static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
{
struct kvm_zdev *kzdev;
- kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL);
+ kzdev = kzalloc_obj(struct kvm_zdev);
if (!kzdev)
return -ENOMEM;
@@ -666,7 +665,7 @@ int __init kvm_s390_pci_init(void)
if (!kvm_s390_pci_interp_allowed())
return 0;
- aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
+ aift = kzalloc_obj(struct zpci_aift);
if (!aift)
return -ENOMEM;
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 1a49b89706f8..cc0553da14cb 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -21,13 +21,14 @@
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
#include <asm/page-states.h>
-#include <asm/gmap.h>
#include <asm/ptrace.h>
#include <asm/sclp.h>
#include <asm/ap.h>
+#include <asm/gmap_helpers.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
+#include "gmap.h"
static int handle_ri(struct kvm_vcpu *vcpu)
{
@@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
if (vcpu->arch.skey_enabled)
return 0;
- rc = s390_enable_skey();
+ rc = gmap_enable_skeys(vcpu->arch.gmap);
VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
if (rc)
return rc;
@@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
static int handle_iske(struct kvm_vcpu *vcpu)
{
- unsigned long gaddr, vmaddr;
- unsigned char key;
+ unsigned long gaddr;
int reg1, reg2;
- bool unlocked;
+ union skey key;
int rc;
vcpu->stat.instruction_iske++;
@@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
- unlocked = false;
- mmap_read_lock(current->mm);
- rc = get_guest_storage_key(current->mm, vmaddr, &key);
-
- if (rc) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- if (!rc) {
- mmap_read_unlock(current->mm);
- goto retry;
- }
- }
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key);
+ if (rc > 0)
+ return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
vcpu->run->s.regs.gprs[reg1] &= ~0xff;
- vcpu->run->s.regs.gprs[reg1] |= key;
+ vcpu->run->s.regs.gprs[reg1] |= key.skey;
return 0;
}
static int handle_rrbe(struct kvm_vcpu *vcpu)
{
- unsigned long vmaddr, gaddr;
+ unsigned long gaddr;
int reg1, reg2;
- bool unlocked;
int rc;
vcpu->stat.instruction_rrbe++;
@@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
- unlocked = false;
- mmap_read_lock(current->mm);
- rc = reset_guest_reference_bit(current->mm, vmaddr);
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- if (!rc) {
- mmap_read_unlock(current->mm);
- goto retry;
- }
- }
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr));
+ if (rc > 0)
+ return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
kvm_s390_set_psw_cc(vcpu, rc);
@@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
{
unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
unsigned long start, end;
- unsigned char key, oldkey;
+ union skey key, oldkey;
int reg1, reg2;
- bool unlocked;
int rc;
vcpu->stat.instruction_sske++;
@@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
if (m3 & SSKE_MB) {
@@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
- unlocked = false;
-
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
- mmap_read_lock(current->mm);
- rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
- m3 & SSKE_NQ, m3 & SSKE_MR,
- m3 & SSKE_MC);
-
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- rc = !rc ? -EAGAIN : rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+ gpa_to_gfn(start), key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC);
}
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
+ if (rc > 1)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- if (rc == -EAGAIN)
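+ /* The per-vCPU MMU cache ran dry: refill it and retry the same page */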
+ if (rc == -ENOMEM) {
+ kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
+ }
if (rc < 0)
return rc;
start += PAGE_SIZE;
@@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
} else {
kvm_s390_set_psw_cc(vcpu, rc);
vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
- vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8;
}
}
if (m3 & SSKE_MB) {
@@ -605,6 +564,14 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
}
}
+#if IS_ENABLED(CONFIG_VFIO_AP)
+bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa)
+{
+ return kvm_is_gpa_in_memslot(kvm, gpa);
+}
+EXPORT_SYMBOL_FOR_MODULES(kvm_s390_is_gpa_in_memslot, "vfio_ap");
+#endif
+
/*
* handle_pqap: Handling pqap interception
* @vcpu: the vcpu having issue the pqap instruction
@@ -746,13 +713,14 @@ int is_valid_psw(psw_t *psw)
int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
{
psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
- psw_compat_t new_psw;
- u64 addr;
+ psw32_t new_psw;
+ u64 addr, iaddr;
int rc;
u8 ar;
vcpu->stat.instruction_lpsw++;
+ iaddr = gpsw->addr - kvm_s390_get_ilen(vcpu);
if (gpsw->mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -770,18 +738,20 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE;
if (!is_valid_psw(gpsw))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ vcpu->arch.sie_block->gbea = iaddr;
return 0;
}
static int handle_lpswe(struct kvm_vcpu *vcpu)
{
psw_t new_psw;
- u64 addr;
+ u64 addr, iaddr;
int rc;
u8 ar;
vcpu->stat.instruction_lpswe++;
+ iaddr = vcpu->arch.sie_block->gpsw.addr - kvm_s390_get_ilen(vcpu);
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -794,6 +764,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gpsw = new_psw;
if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ vcpu->arch.sie_block->gbea = iaddr;
return 0;
}
@@ -1074,7 +1045,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
- unsigned char key;
+ union skey key;
vcpu->stat.instruction_pfmf++;
@@ -1102,7 +1073,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
- key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
+ key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
@@ -1133,14 +1104,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long vmaddr;
- bool unlocked = false;
-
- /* Translate guest address to host address */
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -1151,19 +1114,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- mmap_read_lock(current->mm);
- rc = cond_set_guest_storage_key(current->mm, vmaddr,
- key, NULL, nq, mr, mc);
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- rc = !rc ? -EAGAIN : rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+ gpa_to_gfn(start), key,
+ NULL, nq, mr, mc);
}
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- if (rc == -EAGAIN)
+ if (rc > 1)
+ return kvm_s390_inject_program_int(vcpu, rc);
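+ /* The per-vCPU MMU cache ran dry: refill it and retry the same page */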
+ if (rc == -ENOMEM) {
+ kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
+ }
if (rc < 0)
return rc;
}
@@ -1187,8 +1148,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
int r1, r2, nappended, entries;
- unsigned long gfn, hva, res, pgstev, ptev;
+ union essa_state state;
unsigned long *cbrlo;
+ unsigned long gfn;
+ bool dirtied;
/*
* We don't need to set SD.FPF.SK to 1 here, because if we have a
@@ -1197,33 +1160,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
kvm_s390_get_regs_rre(vcpu, &r1, &r2);
gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
- hva = gfn_to_hva(vcpu->kvm, gfn);
entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
- if (kvm_is_error_hva(hva))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
- nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
- if (nappended < 0) {
- res = orc ? 0x10 : 0;
- vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
+ nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied);
+ vcpu->run->s.regs.gprs[r1] = state.val;
+ if (nappended < 0)
return 0;
- }
- res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
- /*
- * Set the block-content state part of the result. 0 means resident, so
- * nothing to do if the page is valid. 2 is for preserved pages
- * (non-present and non-zero), and 3 for zero pages (non-present and
- * zero).
- */
- if (ptev & _PAGE_INVALID) {
- res |= 2;
- if (pgstev & _PGSTE_GPS_ZERO)
- res |= 1;
- }
- if (pgstev & _PGSTE_GPS_NODAT)
- res |= 0x20;
- vcpu->run->s.regs.gprs[r1] = res;
/*
* It is possible that all the normal 511 slots were full, in which case
* we will now write in the 512th slot, which is reserved for host use.
@@ -1235,27 +1177,44 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
cbrlo[entries] = gfn << PAGE_SHIFT;
}
- if (orc) {
- struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
-
- /* Increment only if we are really flipping the bit */
- if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
- atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
- }
+ if (dirtied)
+ atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
return nappended;
}
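+
+/*
+ * Post-process the CBRL: for each listed guest page whose PGSTE marks it as
+ * unused or logically zero, zap its host mapping so the backing memory can
+ * be discarded; entries that cannot be resolved are skipped.
+ */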
+static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ int i;
+
+ lockdep_assert_held(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < len; i++) {
+ if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce,
+ 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep))
+ continue;
+ if (!ptep || ptep->s.pr)
+ continue;
+ pgste = pgste_get_lock(ptep);
+ if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero)
+ gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]);
+ pgste_set_unlock(ptep, pgste);
+ }
+}
+
static int handle_essa(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_held(&vcpu->kvm->srcu);
+
/* entries expected to be 1FF */
int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
unsigned long *cbrlo;
- struct gmap *gmap;
int i, orc;
VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
- gmap = vcpu->arch.gmap;
vcpu->stat.instruction_essa++;
if (!vcpu->kvm->arch.use_cmma)
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
@@ -1279,11 +1238,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
* value really needs to be written to; if the value is
* already correct, we do nothing and avoid the lock.
*/
- if (vcpu->kvm->mm->context.uses_cmm == 0) {
- mmap_write_lock(vcpu->kvm->mm);
- vcpu->kvm->mm->context.uses_cmm = 1;
- mmap_write_unlock(vcpu->kvm->mm);
- }
+ set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags);
/*
* If we are here, we are supposed to have CMMA enabled in
* the SIE block. Enabling CMMA works on a per-CPU basis,
@@ -1297,24 +1252,22 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
- int srcu_idx;
-
- mmap_read_lock(vcpu->kvm->mm);
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- i = __do_essa(vcpu, orc);
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
- mmap_read_unlock(vcpu->kvm->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ i = __do_essa(vcpu, orc);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
entries += i;
}
- vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
+ /* reset nceo */
+ vcpu->arch.sie_block->cbrlo &= PAGE_MASK;
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
- mmap_read_lock(gmap->mm);
- for (i = 0; i < entries; ++i)
- __gmap_zap(gmap, cbrlo[i]);
- mmap_read_unlock(gmap->mm);
+
+ mmap_read_lock(vcpu->kvm->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ _essa_clear_cbrl(vcpu, cbrlo, entries);
+ mmap_read_unlock(vcpu->kvm->mm);
+
return 0;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 22c012aa5206..c2dafd812a3b 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -5,19 +5,23 @@
* Copyright IBM Corp. 2019, 2020
* Author(s): Janosch Frank <frankja@linux.ibm.com>
*/
+
+#include <linux/export.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
-#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+#include "dat.h"
+#include "gaccess.h"
#include "gmap.h"
+#include "faultin.h"
bool kvm_s390_pv_is_protected(struct kvm *kvm)
{
@@ -34,6 +38,163 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
/**
+ * should_export_before_import() - Determine whether an export is needed
+ * before an import-like operation.
+ * @uvcb: The Ultravisor control block of the UVC to be performed.
+ * @mm: The mm of the process.
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * An export is also not needed when there is only one protected VM,
+ * because in that case the page cannot belong to the wrong VM (there
+ * is no "other VM" it could belong to).
+ *
+ * Return: %true if an export is needed before every import, otherwise %false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ /*
+ * The misc feature indicates, among other things, that importing a
+ * shared page from a different protected VM will automatically also
+ * transfer its ownership.
+ */
+ if (uv_has_feature(BIT_UV_FEAT_MISC))
+ return false;
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
+struct pv_make_secure {
+ void *uvcb;
+ struct folio *folio;
+ int rc;
+ bool needs_export;
+};
+
+static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio)
+{
+ struct pv_make_secure *priv = f->priv;
+ int rc;
+
+ if (priv->needs_export)
+ uv_convert_from_secure(folio_to_phys(folio));
+
+ if (folio_test_hugetlb(folio))
+ return -EFAULT;
+ if (folio_test_large(folio))
+ return -E2BIG;
+
+ if (!f->page)
+ folio_get(folio);
+ rc = __make_folio_secure(folio, priv->uvcb);
+ if (!f->page)
+ folio_put(folio);
+
+ return rc;
+}
+
+static void _kvm_s390_pv_make_secure(struct guest_fault *f)
+{
+ struct pv_make_secure *priv = f->priv;
+ struct folio *folio;
+
+ folio = pfn_folio(f->pfn);
+ priv->rc = -EAGAIN;
+ if (folio_trylock(folio)) {
+ priv->rc = __kvm_s390_pv_make_secure(f, folio);
+ if (priv->rc == -E2BIG || priv->rc == -EBUSY) {
+ priv->folio = folio;
+ folio_get(folio);
+ }
+ folio_unlock(folio);
+ }
+}
+
+/**
+ * kvm_s390_pv_make_secure() - make one guest page secure
+ * @kvm: the guest
+ * @gaddr: the guest address that needs to be made secure
+ * @uvcb: the UVCB specifying which operation needs to be performed
+ *
+ * Context: needs to be called with kvm->srcu held.
+ * Return: 0 on success, < 0 in case of error.
+ */
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
+{
+ struct pv_make_secure priv = { .uvcb = uvcb };
+ struct guest_fault f = {
+ .write_attempt = true,
+ .gfn = gpa_to_gfn(gaddr),
+ .callback = _kvm_s390_pv_make_secure,
+ .priv = &priv,
+ };
+ int rc;
+
+ lockdep_assert_held(&kvm->srcu);
+
+ priv.needs_export = should_export_before_import(uvcb, kvm->mm);
+
+ scoped_guard(mutex, &kvm->arch.pv.import_lock) {
+ rc = kvm_s390_faultin_gfn(NULL, kvm, &f);
+
+ if (!rc) {
+ rc = priv.rc;
+ if (priv.folio) {
+ rc = s390_wiggle_split_folio(kvm->mm, priv.folio);
+ if (!rc)
+ rc = -EAGAIN;
+ }
+ }
+ }
+ if (priv.folio)
+ folio_put(priv.folio);
+ return rc;
+}
+
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(kvm),
+ .gaddr = gaddr,
+ };
+
+ return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb);
+}
+
+/**
+ * kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @kvm: the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: may sleep.
+ * Return: 0 in case of success, otherwise a negative error code.
+ */
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr)
+{
+ struct page *page;
+ int rc = 0;
+
+ mmap_read_lock(kvm->mm);
+ page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
+ if (page)
+ rc = __kvm_s390_pv_destroy_page(page);
+ kvm_release_page_clean(page);
+ mmap_read_unlock(kvm->mm);
+ return rc;
+}
+
+/**
* struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
* be destroyed
*
@@ -240,35 +401,6 @@ done_fast:
return 0;
}
-/**
- * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
- * @kvm: the VM whose memory is to be cleared.
- *
- * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
- * The CPUs of the protected VM need to be destroyed beforehand.
- */
-static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
-{
- const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
- struct kvm_memory_slot *slot;
- unsigned long len;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&kvm->srcu);
-
- /* Take the memslot containing guest absolute address 0 */
- slot = gfn_to_memslot(kvm, 0);
- /* Clear all slots or parts thereof that are below 2GB */
- while (slot && slot->base_gfn < pages_2g) {
- len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
- s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
- /* Take the next memslot */
- slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
- }
-
- srcu_read_unlock(&kvm->srcu, srcu_idx);
-}
-
static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct uv_cb_destroy_fast uvcb = {
@@ -283,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
*rc = uvcb.header.rc;
if (rrc)
*rrc = uvcb.header.rrc;
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
uvcb.header.rc, uvcb.header.rrc);
WARN_ONCE(cc && uvcb.header.rc != 0x104,
@@ -332,10 +463,10 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* Guest with segment type ASCE, refuse to destroy asynchronously */
- if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT)
return -EINVAL;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ priv = kzalloc_obj(*priv);
if (!priv)
return -ENOMEM;
@@ -345,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
priv->stor_var = kvm->arch.pv.stor_var;
priv->stor_base = kvm->arch.pv.stor_base;
priv->handle = kvm_s390_pv_get_handle(kvm);
- priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce);
if (s390_replace_asce(kvm->arch.gmap))
res = -ENOMEM;
}
@@ -356,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return res;
}
- kvm_s390_destroy_lower_2g(kvm);
+ gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false);
kvm_s390_clear_pv_state(kvm);
kvm->arch.pv.set_aside = priv;
@@ -390,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
if (!cc) {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
@@ -473,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
* cleanup has been performed.
*/
if (need_zap && mmget_not_zero(kvm->mm)) {
- s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false);
mmput(kvm->mm);
}
@@ -511,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* When a fatal signal is received, stop immediately */
- if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+ if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true))
goto done;
if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
ret = -EIO;
@@ -550,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
+ set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags);
}
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
@@ -565,6 +695,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
int cc, ret;
u16 dummy;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ /* The notifier will be unregistered when the VM is destroyed */
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ if (ret) {
+ kvm->arch.pv.mmu_notifier.ops = NULL;
+ return ret;
+ }
+ }
+
ret = kvm_s390_pv_alloc_vm(kvm);
if (ret)
return ret;
@@ -572,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
/* Inputs */
uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
uvcb.guest_stor_len = kvm->arch.pv.guest_len;
- uvcb.guest_asce = kvm->arch.gmap->asce;
+ uvcb.guest_asce = kvm->arch.gmap->asce.val;
uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
uvcb.conf_base_stor_origin =
virt_to_phys((void *)kvm->arch.pv.stor_base);
@@ -580,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
+ clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags);
+ gmap_split_huge_pages(kvm->arch.gmap);
+
cc = uv_call_sched(0, (u64)&uvcb);
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
@@ -599,12 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
}
return -EIO;
}
- kvm->arch.gmap->guest_handle = uvcb.guest_handle;
- /* Add the notifier only once. No races because we hold kvm->lock */
- if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
- kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
- mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
- }
return 0;
}
@@ -638,27 +776,15 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
.tweak[0] = tweak,
.tweak[1] = offset,
};
- int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
- unsigned long vmaddr;
- bool unlocked;
+ int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
if (ret == -ENXIO) {
- mmap_read_lock(kvm->mm);
- vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
- if (kvm_is_error_hva(vmaddr)) {
- ret = -EFAULT;
- } else {
- ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
- if (!ret)
- ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
- }
- mmap_read_unlock(kvm->mm);
+ ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true);
if (!ret)
return -EAGAIN;
- return ret;
}
if (ret && ret != -EAGAIN)
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 9ac92dbf680d..9e28f165c114 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu,
__entry->sie_block = sie_block;
),
- TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK",
+ TP_printk("create cpu %d at 0x%p, sie block at 0x%p",
__entry->id, __entry->vcpu, __entry->sie_block)
);
@@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css,
__entry->kvm = kvm;
),
- TP_printk("enabling channel I/O support (kvm @ %pK)\n",
+ TP_printk("enabling channel I/O support (kvm @ %p)\n",
__entry->kvm)
);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index a78df3a4f353..e5a23f1c9749 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -15,7 +15,6 @@
#include <linux/io.h>
#include <linux/mman.h>
-#include <asm/gmap.h>
#include <asm/mmu_context.h>
#include <asm/sclp.h>
#include <asm/nmi.h>
@@ -42,8 +41,11 @@ struct vsie_page {
* are reused conditionally, should be accessed via READ_ONCE.
*/
struct kvm_s390_sie_block *scb_o; /* 0x0218 */
- /* the shadow gmap in use by the vsie_page */
- struct gmap *gmap; /* 0x0220 */
+ /*
+ * Flags: must be set/cleared atomically after the vsie page can be
+ * looked up by other CPUs.
+ */
+ unsigned long flags; /* 0x0220 */
/* address of the last reported fault to guest2 */
unsigned long fault_addr; /* 0x0228 */
/* calculated guest addresses of satellite control blocks */
@@ -58,16 +60,15 @@ struct vsie_page {
* radix tree.
*/
gpa_t scb_gpa; /* 0x0258 */
- /*
- * Flags: must be set/cleared atomically after the vsie page can be
- * looked up by other CPUs.
- */
- unsigned long flags; /* 0x0260 */
- __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */
+ /* the shadow gmap in use by the vsie_page */
+ struct gmap_cache gmap_cache; /* 0x0260 */
+ __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */
struct kvm_s390_crypto_cb crycb; /* 0x0700 */
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
+static_assert(sizeof(struct vsie_page) == PAGE_SIZE);
+
/* trigger a validity icpt for the given scb */
static int set_validity_icpt(struct kvm_s390_sie_block *scb,
__u16 reason_code)
@@ -124,8 +125,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
- /* we don't allow ESA/390 guests */
- if (!(cpuflags & CPUSTAT_ZARCH))
+ /* we don't allow ESA/390 guests unless explicitly enabled */
+ if (!(cpuflags & CPUSTAT_ZARCH) && !vcpu->kvm->arch.allow_vsie_esamode)
return set_validity_icpt(scb_s, 0x0001U);
if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
@@ -134,7 +135,9 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return set_validity_icpt(scb_s, 0x0007U);
/* intervention requests will be set later */
- newflags = CPUSTAT_ZARCH;
+ newflags = 0;
+ if (cpuflags & CPUSTAT_ZARCH)
+ newflags = CPUSTAT_ZARCH;
if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
newflags |= CPUSTAT_GED;
if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
@@ -384,6 +387,17 @@ end:
return 0;
}
+static void shadow_esa(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+ /* Ensure these bits are indeed turned off */
+ scb_s->eca &= ~ECA_VX;
+ scb_s->ecb &= ~(ECB_GS | ECB_TE);
+ scb_s->ecb3 &= ~ECB3_RI;
+ scb_s->ecd &= ~ECD_HOSTREGMGMT;
+}
+
/* shadow (round up/down) the ibc to avoid validity icpt */
static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
@@ -465,7 +479,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
/* READ_ONCE does not work on bitfields - use a temporary variable */
const uint32_t __new_prefix = scb_o->prefix;
- const uint32_t new_prefix = READ_ONCE(__new_prefix);
+ uint32_t new_prefix = READ_ONCE(__new_prefix);
const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE;
bool had_tx = scb_s->ecb & ECB_TE;
unsigned long new_mso = 0;
@@ -513,6 +527,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->icpua = scb_o->icpua;
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH))
+ new_prefix &= GUEST_PREFIX_MASK_ESA;
+ else
+ new_prefix &= GUEST_PREFIX_MASK_ZARCH;
+
if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
/* if the hva of the prefix changes, we have to remap the prefix */
@@ -587,6 +606,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->hpid = HPID_VSIE;
scb_s->cpnc = scb_o->cpnc;
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH))
+ shadow_esa(vcpu, vsie_page);
+
prepare_ibc(vcpu, vsie_page);
rc = shadow_crycb(vcpu, vsie_page);
out:
@@ -595,26 +617,17 @@ out:
return rc;
}
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end)
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end)
{
- struct kvm *kvm = gmap->private;
- struct vsie_page *cur;
+ struct vsie_page *cur, *next;
unsigned long prefix;
- int i;
- if (!gmap_is_shadow(gmap))
- return;
+ KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm);
/*
* Only new shadow blocks are added to the list during runtime,
* therefore we can safely reference them all the time.
*/
- for (i = 0; i < kvm->arch.vsie.page_count; i++) {
- cur = READ_ONCE(kvm->arch.vsie.pages[i]);
- if (!cur)
- continue;
- if (READ_ONCE(cur->gmap) != gmap)
- continue;
+ list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) {
prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
/* with mso/msl, the prefix lies at an offset */
prefix += cur->scb_s.mso;
@@ -635,7 +648,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
* - -EAGAIN if the caller can retry immediately
* - -ENOMEM if out of memory
*/
-static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
@@ -650,10 +663,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
+ rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true);
if (!rc && (scb_s->ecb & ECB_TE))
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE, NULL);
+ rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -765,7 +777,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
rc = set_validity_icpt(scb_s, 0x0011U);
else if ((gpa & PAGE_MASK) !=
- ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK))
rc = set_validity_icpt(scb_s, 0x003bU);
if (!rc) {
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
@@ -934,8 +946,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
* - > 0 if control has to be given to guest 2
* - < 0 if an error occurred
*/
-static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
+ bool wr = kvm_s390_cur_gmap_fault_is_write();
int rc;
if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
@@ -943,12 +956,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return inject_fault(vcpu, PGM_PROTECTION,
current->thread.gmap_teid.addr * PAGE_SIZE, 1);
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
+ rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
- current->thread.gmap_teid.addr * PAGE_SIZE,
- kvm_s390_cur_gmap_fault_is_write());
+ current->thread.gmap_teid.addr * PAGE_SIZE, wr);
if (rc >= 0)
vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
}
@@ -961,12 +972,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*
* Will ignore any errors. The next SIE fault will do proper fault handling.
*/
-static void handle_last_fault(struct kvm_vcpu *vcpu,
- struct vsie_page *vsie_page)
+static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
if (vsie_page->fault_addr)
- kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr, NULL);
+ gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true);
vsie_page->fault_addr = 0;
}
@@ -1048,11 +1057,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
}
}
-static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ unsigned long src, dest, mask, prefix;
u64 *pei_block = &vsie_page->scb_o->mcic;
+ union mvpg_pei pei_dest, pei_src;
int edat, rc_dest, rc_src;
union ctlreg0 cr0;
@@ -1066,8 +1076,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
- rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
- rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true);
+ rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false);
/*
* Either everything went well, or something non-critical went wrong
* e.g. because of a race. In either case, simply retry.
@@ -1102,8 +1112,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
}
if (!rc_dest && !rc_src) {
- pei_block[0] = pei_dest;
- pei_block[1] = pei_src;
+ pei_block[0] = pei_dest.val;
+ pei_block[1] = pei_src.val;
return 1;
}
@@ -1127,16 +1137,17 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* - > 0 if control has to be given to guest 2
* - < 0 if an error occurred
*/
-static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
__releases(vcpu->kvm->srcu)
__acquires(vcpu->kvm->srcu)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ unsigned long sie_return = SIE64_RETURN_NORMAL;
int guest_bp_isolation;
int rc = 0;
- handle_last_fault(vcpu, vsie_page);
+ handle_last_fault(vcpu, vsie_page, sg);
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -1153,10 +1164,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->fpf & FPF_BPBC)
set_thread_flag(TIF_ISOLATE_BP_GUEST);
- local_irq_disable();
- guest_enter_irqoff();
- local_irq_enable();
-
/*
* Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
* and VCPU requests also hinder the vSIE from running and lead
@@ -1166,31 +1173,44 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
current->thread.gmap_int_code = 0;
barrier();
- if (!kvm_s390_vcpu_sie_inhibited(vcpu))
- rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+xfer_to_guest_mode_check:
+ local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ goto skip_sie;
+ goto xfer_to_guest_mode_check;
+ }
+ guest_timing_enter_irqoff();
+ sie_return = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+ }
+
+skip_sie:
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
- local_irq_disable();
- guest_exit_irqoff();
- local_irq_enable();
-
/* restore guest state for bp isolation override */
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
kvm_vcpu_srcu_read_lock(vcpu);
- if (rc == -EINTR) {
- VCPU_EVENT(vcpu, 3, "%s", "machine check");
+ if (sie_return == SIE64_RETURN_MCCK) {
kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
return 0;
}
+ WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL);
+
if (rc > 0)
rc = 0; /* we could still have an icpt */
else if (current->thread.gmap_int_code)
- return handle_fault(vcpu, vsie_page);
+ return handle_fault(vcpu, vsie_page, sg);
switch (scb_s->icptcode) {
case ICPT_INST:
@@ -1208,7 +1228,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
break;
case ICPT_PARTEXEC:
if (scb_s->ipa == 0xb254)
- rc = vsie_handle_mvpg(vcpu, vsie_page);
+ rc = vsie_handle_mvpg(vcpu, vsie_page, sg);
break;
}
return rc;
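do_vsie_run() now enters SIE through the generic xfer-to-guest-mode protocol (selected by VIRT_XFER_TO_GUEST_WORK in Kconfig): with interrupts disabled it checks for pending work, handles it with interrupts enabled, and retries until nothing is pending; only then is SIE entered between guest_timing_enter_irqoff() and guest_timing_exit_irqoff(). A bare-bones sketch of that pattern follows, with the s390-specific entry (kvm_s390_enter_exit_sie() in the hunk above) reduced to a placeholder; the helper and its error handling are illustrative, only the entry-work API calls are the ones actually used.

/*
 * Bare-bones sketch of the xfer-to-guest-mode entry pattern, for
 * illustration only.  The actual guest entry is architecture specific and
 * is reduced to a placeholder comment here.
 */
static int enter_guest_with_work_check(struct kvm_vcpu *vcpu)
{
	int rc;

	for (;;) {
		local_irq_disable();
		xfer_to_guest_mode_prepare();
		if (!xfer_to_guest_mode_work_pending())
			break;
		local_irq_enable();
		rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
		if (rc)
			return rc;	/* pending work asked us to bail out */
	}
	guest_timing_enter_irqoff();
	/* ... architecture-specific guest entry would go here ... */
	guest_timing_exit_irqoff();
	local_irq_enable();
	return 0;
}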
@@ -1216,43 +1236,67 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static void release_gmap_shadow(struct vsie_page *vsie_page)
{
- if (vsie_page->gmap)
- gmap_put(vsie_page->gmap);
- WRITE_ONCE(vsie_page->gmap, NULL);
+ struct gmap *gmap = vsie_page->gmap_cache.gmap;
+
+ lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock);
+
+ list_del(&vsie_page->gmap_cache.list);
+ vsie_page->gmap_cache.gmap = NULL;
prefix_unmapped(vsie_page);
+
+ if (list_empty(&gmap->scb_users)) {
+ gmap_remove_child(gmap);
+ gmap_put(gmap);
+ }
}
-static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
- struct vsie_page *vsie_page)
+static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
- unsigned long asce;
union ctlreg0 cr0;
struct gmap *gmap;
+ union asce asce;
int edat;
- asce = vcpu->arch.sie_block->gcr[1];
+ asce.val = vcpu->arch.sie_block->gcr[1];
cr0.val = vcpu->arch.sie_block->gcr[0];
edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
edat += edat && test_kvm_facility(vcpu->kvm, 78);
- /*
- * ASCE or EDAT could have changed since last icpt, or the gmap
- * we're holding has been unshadowed. If the gmap is still valid,
- * we can safely reuse it.
- */
- if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
- vcpu->kvm->stat.gmap_shadow_reuse++;
- return 0;
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+ gmap = vsie_page->gmap_cache.gmap;
+ if (gmap) {
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (gmap_is_shadow_valid(gmap, asce, edat)) {
+ vcpu->kvm->stat.gmap_shadow_reuse++;
+ gmap_get(gmap);
+ return gmap;
+ }
+ /* release the old shadow and mark the prefix as unmapped */
+ release_gmap_shadow(vsie_page);
+ }
}
-
- /* release the old shadow - if any, and mark the prefix as unmapped */
- release_gmap_shadow(vsie_page);
- gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+again:
+ gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat);
if (IS_ERR(gmap))
- return PTR_ERR(gmap);
- vcpu->kvm->stat.gmap_shadow_create++;
- WRITE_ONCE(vsie_page->gmap, gmap);
- return 0;
+ return gmap;
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+ /* unlikely race condition, remove the previous shadow */
+ if (vsie_page->gmap_cache.gmap)
+ release_gmap_shadow(vsie_page);
+ if (!gmap->parent) {
+ gmap_put(gmap);
+ goto again;
+ }
+ vcpu->kvm->stat.gmap_shadow_create++;
+ list_add(&vsie_page->gmap_cache.list, &gmap->scb_users);
+ vsie_page->gmap_cache.gmap = gmap;
+ prefix_unmapped(vsie_page);
+ }
+ return gmap;
}
/*
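acquire_gmap_shadow() now serializes on the parent gmap's children_lock through scoped_guard(): a still-valid cached shadow is reused under the lock, otherwise a new shadow is created outside the lock and re-checked under it, retrying via the again label if the freshly created shadow already lost its parent in a race. A minimal sketch of that create-outside-the-lock, recheck-under-the-lock pattern follows; apart from the scoped_guard() idiom itself (linux/cleanup.h), every name in it is hypothetical and only stands in for the gmap-specific helpers used above.

/*
 * Minimal, hypothetical illustration of the locking pattern above.  The
 * spinlock taken by scoped_guard() is dropped automatically when the scope
 * is left, so the sleeping slow path (creation, retry) stays outside the
 * critical section.  None of these types or helpers exist in the kernel.
 */
static struct thing *lookup_or_create_thing(struct thing_cache *c)
{
	struct thing *t;

again:
	scoped_guard(spinlock, &c->lock) {
		t = c->cached;
		if (t && thing_still_valid(t))
			return thing_get(t);	/* fast path: reuse */
	}
	t = thing_create();			/* may sleep, lock not held */
	scoped_guard(spinlock, &c->lock) {
		if (thing_was_orphaned(t)) {	/* lost a race meanwhile */
			thing_put(t);
			goto again;
		}
		c->cached = t;
	}
	return t;
}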
@@ -1305,15 +1349,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct gmap *sg = NULL;
int rc = 0;
while (1) {
- rc = acquire_gmap_shadow(vcpu, vsie_page);
+ sg = acquire_gmap_shadow(vcpu, vsie_page);
+ if (IS_ERR(sg)) {
+ rc = PTR_ERR(sg);
+ sg = NULL;
+ }
if (!rc)
- rc = map_prefix(vcpu, vsie_page);
+ rc = map_prefix(vcpu, vsie_page, sg);
if (!rc) {
update_intervention_requests(vsie_page);
- rc = do_vsie_run(vcpu, vsie_page);
+ rc = do_vsie_run(vcpu, vsie_page, sg);
}
atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
@@ -1331,14 +1380,17 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* but rewind the PSW to re-enter SIE once that's completed
* instead of passing a "no action" intercept to the guest.
*/
- if (signal_pending(current) ||
- kvm_s390_vcpu_has_irq(vcpu, 0) ||
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) ||
kvm_s390_vcpu_sie_inhibited(vcpu)) {
kvm_s390_rewind_psw(vcpu, 4);
break;
}
+ if (sg)
+ sg = gmap_put(sg);
cond_resched();
}
+ if (sg)
+ sg = gmap_put(sg);
if (rc == -EFAULT) {
/*
@@ -1434,8 +1486,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
vsie_page->scb_gpa = ULONG_MAX;
/* Double use of the same address or allocation failure. */
- if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
- vsie_page)) {
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) {
put_vsie_page(vsie_page);
mutex_unlock(&kvm->arch.vsie.mutex);
return NULL;
@@ -1444,7 +1495,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_unlock(&kvm->arch.vsie.mutex);
memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
- release_gmap_shadow(vsie_page);
+ if (vsie_page->gmap_cache.gmap) {
+ scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+ if (vsie_page->gmap_cache.gmap)
+ release_gmap_shadow(vsie_page);
+ }
+ prefix_unmapped(vsie_page);
vsie_page->fault_addr = 0;
vsie_page->scb_s.ihcpu = 0xffffU;
return vsie_page;
@@ -1469,18 +1525,19 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
if (unlikely(scb_addr & 0x1ffUL))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) {
kvm_s390_rewind_psw(vcpu, 4);
return 0;
}
vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
- if (IS_ERR(vsie_page))
+ if (IS_ERR(vsie_page)) {
return PTR_ERR(vsie_page);
- else if (!vsie_page)
+ } else if (!vsie_page) {
/* double use of sie control block - simply do nothing */
+ kvm_s390_rewind_psw(vcpu, 4);
return 0;
+ }
rc = pin_scb(vcpu, vsie_page, scb_addr);
if (rc)
@@ -1521,8 +1578,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
mutex_lock(&kvm->arch.vsie.mutex);
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
vsie_page = kvm->arch.vsie.pages[i];
+ scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+ if (vsie_page->gmap_cache.gmap)
+ release_gmap_shadow(vsie_page);
kvm->arch.vsie.pages[i] = NULL;
- release_gmap_shadow(vsie_page);
/* free the radix tree entry */
if (vsie_page->scb_gpa != ULONG_MAX)
radix_tree_delete(&kvm->arch.vsie.addr_to_page,