Diffstat (limited to 'arch/s390/kvm')
-rw-r--r--  arch/s390/kvm/Kconfig          4
-rw-r--r--  arch/s390/kvm/Makefile         3
-rw-r--r--  arch/s390/kvm/dat.c         1321
-rw-r--r--  arch/s390/kvm/dat.h          976
-rw-r--r--  arch/s390/kvm/diag.c          32
-rw-r--r--  arch/s390/kvm/faultin.c      148
-rw-r--r--  arch/s390/kvm/faultin.h       92
-rw-r--r--  arch/s390/kvm/gaccess.c     1031
-rw-r--r--  arch/s390/kvm/gaccess.h       20
-rw-r--r--  arch/s390/kvm/gmap-vsie.c    142
-rw-r--r--  arch/s390/kvm/gmap.c        1342
-rw-r--r--  arch/s390/kvm/gmap.h         252
-rw-r--r--  arch/s390/kvm/guestdbg.c       8
-rw-r--r--  arch/s390/kvm/intercept.c     29
-rw-r--r--  arch/s390/kvm/interrupt.c    168
-rw-r--r--  arch/s390/kvm/kvm-s390.c    1329
-rw-r--r--  arch/s390/kvm/kvm-s390.h      75
-rw-r--r--  arch/s390/kvm/pci.c            9
-rw-r--r--  arch/s390/kvm/priv.c         237
-rw-r--r--  arch/s390/kvm/pv.c           246
-rw-r--r--  arch/s390/kvm/trace-s390.h     4
-rw-r--r--  arch/s390/kvm/vsie.c         255
22 files changed, 5792 insertions, 1931 deletions
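
For orientation before the full diff: the new API centers on a pre-filled struct kvm_s390_mmu_cache, topped up in sleepable context and then consumed under kvm->mmu_lock by dat_entry_walk(). A minimal usage sketch follows (editor's illustration, not part of the patch; example_map_one() and its pfn parameter are invented names, everything else is declared in dat.h below):

static int example_map_one(union asce asce, gfn_t gfn, kvm_pfn_t pfn)
{
	struct kvm_s390_mmu_cache *mc;
	union crste *crstep;
	union pte *ptep;
	int rc;

	mc = kvm_s390_new_mmu_cache();	/* allocates and tops up; may sleep */
	if (!mc)
		return -ENOMEM;

	/* From here on, dat_entry_walk() expects kvm->mmu_lock to be held. */
	rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
			    &crstep, &ptep);
	if (!rc && ptep)	/* install a writable, dirty, non-special mapping */
		dat_ptep_xchg(ptep, _pte(pfn, true, true, false), gfn, asce, false);

	kvm_s390_free_mmu_cache(mc);	/* returns any unused tables to the allocator */
	return rc;
}
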
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..5b835bc6a194 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON @@ -29,7 +28,8 @@ config KVM select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL select KVM_VFIO - select MMU_NOTIFIER + select VIRT_XFER_TO_GUEST_WORK + select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index f0ffe874adc2..dac9d53b23d8 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += dat.o gmap.o faultin.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c new file mode 100644 index 000000000000..7b8d70fe406d --- /dev/null +++ b/arch/s390/kvm/dat.c @@ -0,0 +1,1321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020, 2024 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/pgtable.h> +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> +#include <linux/pgalloc.h> + +#include <asm/page-states.h> +#include <asm/tlb.h> +#include "dat.h" + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc) +{ + void *o; + + for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) { + o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!o) + return -ENOMEM; + mc->crsts[mc->n_crsts] = o; + } + for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) { + o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->pts[mc->n_pts] = o; + } + for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) { + o = kzalloc_obj(*mc->rmaps[0], GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->rmaps[mc->n_rmaps] = o; + } + return 0; +} + +static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct page_table *res; + + res = kvm_s390_mmu_cache_alloc_pt(mc); + if (res) + __arch_set_page_dat(res, 1); + return res; +} + +static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct crst_table *res; + + res = kvm_s390_mmu_cache_alloc_crst(mc); + if (res) + __arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER); + return res; +} + +struct crst_table *dat_alloc_crst_sleepable(unsigned long init) +{ + struct page *page; + void *virt; + + page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!page) + return NULL; + virt = page_to_virt(page); + 
__arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER);
+	crst_table_init(virt, init);
+	return virt;
+}
+
+void dat_free_level(struct crst_table *table, bool owns_ptes)
+{
+	unsigned int i;
+
+	for (i = 0; i < _CRST_ENTRIES; i++) {
+		if (table->crstes[i].h.fc || table->crstes[i].h.i)
+			continue;
+		if (!is_pmd(table->crstes[i]))
+			dat_free_level(dereference_crste(table->crstes[i]), owns_ptes);
+		else if (owns_ptes)
+			dat_free_pt(dereference_pmd(table->crstes[i].pmd));
+	}
+	dat_free_crst(table);
+}
+
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype)
+{
+	struct crst_table *table;
+	union crste crste;
+
+	while (asce->dt > newtype) {
+		table = dereference_asce(*asce);
+		crste = table->crstes[0];
+		if (crste.h.fc)
+			return 0;
+		if (!crste.h.i) {
+			asce->rsto = crste.h.fc0.to;
+			dat_free_crst(table);
+		} else {
+			crste.h.tt--;
+			crst_table_init((void *)table, crste.val);
+		}
+		asce->dt--;
+	}
+	while (asce->dt < newtype) {
+		crste = _crste_fc0(asce->rsto, asce->dt + 1);
+		table = dat_alloc_crst_noinit(mc);
+		if (!table)
+			return -ENOMEM;
+		crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val);
+		table->crstes[0] = crste;
+		asce->rsto = __pa(table) >> PAGE_SHIFT;
+		asce->dt++;
+	}
+	return 0;
+}
+
+/**
+ * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
+ * @crstep: Pointer to the CRST entry.
+ * @old: Expected old value.
+ * @new: Replacement entry.
+ * @gfn: The affected guest address.
+ * @asce: The asce of the address space.
+ *
+ * This function is needed to atomically exchange a CRSTE that potentially
+ * maps a prefix area, without having to invalidate it in between.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %true if the exchange was successful.
+ */
+bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+					 gfn_t gfn, union asce asce)
+{
+	if (old.h.i)
+		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
+	if (cpu_has_edat2())
+		return crdte_crste(crstep, old, new, gfn, asce);
+	return cspg_crste(crstep, old, new);
+}
+
+static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste)
+{
+	union skey nkey = { .acc = pgste.acc, .fp = pgste.fp };
+
+	page_set_storage_key(pte_origin(pte), nkey.skey, 0);
+}
+
+static void dat_move_storage_key(union pte old, union pte new)
+{
+	page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1);
+}
+
+static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste)
+{
+	union skey skey;
+
+	skey.skey = page_get_storage_key(pte_origin(pte));
+
+	pgste.acc = skey.acc;
+	pgste.fp = skey.fp;
+	pgste.gr |= skey.r;
+	pgste.gc |= skey.c;
+
+	return pgste;
+}
+
+union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn,
+			    union asce asce, bool uses_skeys)
+{
+	union pte old = READ_ONCE(*ptep);
+
+	/* Updating only the software bits while holding the pgste lock. */
+	if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) {
+		WRITE_ONCE(ptep->swbyte, new.swbyte);
+		return pgste;
+	}
+
+	if (!old.h.i) {
+		unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0);
+
+		if (machine_has_tlb_guest())
+			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL);
+		else
+			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL);
+	}
+
+	if (uses_skeys) {
+		if (old.h.i && !new.h.i)
+			/* Invalid to valid: restore storage keys from PGSTE. */
+			dat_set_storage_key_from_pgste(new, pgste);
+		else if (!old.h.i && new.h.i)
+			/* Valid to invalid: save storage keys to PGSTE. */
+			pgste = dat_save_storage_key_into_pgste(old, pgste);
+		else if (!old.h.i && !new.h.i)
+			/* Valid to valid: move storage keys. */
+			if (old.h.pfra != new.h.pfra)
+				dat_move_storage_key(old, new);
+		/* Invalid to invalid: nothing to do. */
+	}
+
+	WRITE_ONCE(*ptep, new);
+	return pgste;
+}
+
+/*
+ * dat_split_ste() - Split a segment table entry into page table entries.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: 0 in case of success, -ENOMEM if running out of memory.
+ */
+static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn,
+			 union asce asce, bool uses_skeys)
+{
+	union pgste pgste_init;
+	struct page_table *pt;
+	union pmd new, old;
+	union pte init;
+	int i;
+
+	BUG_ON(!mc);
+	old = READ_ONCE(*pmdp);
+
+	/* Already split, nothing to do. */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	pt = dat_alloc_pt_noinit(mc);
+	if (!pt)
+		return -ENOMEM;
+	new.val = virt_to_phys(pt);
+
+	while (old.h.i || old.h.fc) {
+		init.val = pmd_origin_large(old);
+		init.h.p = old.h.p;
+		init.h.i = old.h.i;
+		init.s.d = old.s.fc1.d;
+		init.s.w = old.s.fc1.w;
+		init.s.y = old.s.fc1.y;
+		init.s.sd = old.s.fc1.sd;
+		init.s.pr = old.s.fc1.pr;
+		pgste_init.val = 0;
+		if (old.h.fc) {
+			for (i = 0; i < _PAGE_ENTRIES; i++)
+				pt->ptes[i].val = init.val | i * PAGE_SIZE;
+			/* No need to take locks as the page table is not installed yet. */
+			pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+			pgste_init.vsie_notif = old.s.fc1.vsie_notif;
+			pgste_init.pcl = uses_skeys && init.h.i;
+			dat_init_pgstes(pt, pgste_init.val);
+		} else {
+			dat_init_page_table(pt, init.val, 0);
+		}
+
+		if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) {
+			if (!pgste_init.pcl)
+				return 0;
+			for (i = 0; i < _PAGE_ENTRIES; i++) {
+				union pgste pgste = pt->pgstes[i];
+
+				pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste);
+				pgste_set_unlock(pt->ptes + i, pgste);
+			}
+			return 0;
+		}
+		old = READ_ONCE(*pmdp);
+	}
+
+	dat_free_pt(pt);
+	return 0;
+}
+
+/*
+ * dat_split_crste() - Split a crste into smaller crstes.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %0 in case of success, %-ENOMEM if running out of memory.
+ */
+static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep,
+			   gfn_t gfn, union asce asce, bool uses_skeys)
+{
+	struct crst_table *table;
+	union crste old, new, init;
+	int i;
+
+	old = READ_ONCE(*crstep);
+	if (is_pmd(old))
+		return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys);
+
+	BUG_ON(!mc);
+
+	/* Already split, nothing to do. */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	table = dat_alloc_crst_noinit(mc);
+	if (!table)
+		return -ENOMEM;
+
+	new.val = virt_to_phys(table);
+	new.h.tt = old.h.tt;
+	new.h.fc0.tl = _REGION_ENTRY_LENGTH;
+
+	while (old.h.i || old.h.fc) {
+		init = old;
+		init.h.tt--;
+		if (old.h.fc) {
+			for (i = 0; i < _CRST_ENTRIES; i++)
+				table->crstes[i].val = init.val | i * HPAGE_SIZE;
+		} else {
+			crst_table_init((void *)table, init.val);
+		}
+		if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
+			return 0;
+		old = READ_ONCE(*crstep);
+	}
+
+	dat_free_crst(table);
+	return 0;
+}
+
+/**
+ * dat_entry_walk() - Walk the gmap page tables.
+ * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither
+ *      %DAT_WALK_SPLIT nor %DAT_WALK_ALLOC is specified in @flags.
+ * @gfn: Guest frame.
+ * @asce: The ASCE of the address space.
+ * @flags: Flags from the DAT_WALK_* macros.
+ * @walk_level: Level to walk to, from the TABLE_TYPE_* macros.
+ * @last: Will be filled with the last visited non-pte DAT entry.
+ * @ptepp: Will be filled with the last visited pte entry, if any, otherwise NULL.
+ *
+ * Returns a table entry pointer for the given guest address and @walk_level.
+ *
+ * The @flags have the following meanings:
+ * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries
+ * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
+ * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
+ * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered
+ * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached
+ * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to
+ *			 continue walking to ptes with only DAT_WALK_ANY
+ * * %DAT_WALK_USES_SKEYS: storage keys are in use
+ *
+ * Context: called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * %PGM_ADDRESSING if the requested address lies outside memory
+ * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
+ * * %-EFAULT if the requested address lies inside a memory hole of a different type
+ * * %-EINVAL if the given ASCE is not compatible with the requested level
+ * * %-EFBIG if the requested level could not be reached because a larger frame was found
+ * * %-ENOENT if the requested level could not be reached for other reasons
+ * * %-ENOMEM if running out of memory while allocating or splitting a table
+ */
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+		   int walk_level, union crste **last, union pte **ptepp)
+{
+	union vaddress vaddr = { .addr = gfn_to_gpa(gfn) };
+	bool continue_anyway = flags & DAT_WALK_CONTINUE;
+	bool uses_skeys = flags & DAT_WALK_USES_SKEYS;
+	bool ign_holes = flags & DAT_WALK_IGN_HOLES;
+	bool allocate = flags & DAT_WALK_ALLOC;
+	bool split = flags & DAT_WALK_SPLIT;
+	bool leaf = flags & DAT_WALK_LEAF;
+	bool any = flags & DAT_WALK_ANY;
+	struct page_table *pgtable;
+	struct crst_table *table;
+	union crste entry;
+	int rc;
+
+	*last = NULL;
+	*ptepp = NULL;
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, gfn))
+		return PGM_ADDRESSING;
+
+	table = dereference_asce(asce);
+	if (asce.dt >= ASCE_TYPE_REGION1) {
+		*last = table->crstes + vaddr.rfx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION1)
+			return 0;
+		if (entry.pgd.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pgd);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION2) {
+		*last = table->crstes + vaddr.rsx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION2)
+			return 0;
+		if (entry.p4d.h.i) {
+			if (!allocate)
+				return any ?
0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.p4d); + } + + if (asce.dt >= ASCE_TYPE_REGION3) { + *last = table->crstes + vaddr.rtx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION3 && + continue_anyway && !entry.pud.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc)) + return 0; + if (entry.pud.h.i && !entry.pud.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pud); + } + + *last = table->crstes + vaddr.sx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc)) + return 0; + + if (entry.pmd.h.i && !entry.pmd.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + pgtable = dereference_pmd(entry.pmd); + *ptepp = pgtable->ptes + vaddr.px; + if (pte_hole(**ptepp) && !ign_holes) + return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? 
(*ptepp)->tok.par : -EFAULT;
+	return 0;
+}
+
+static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
+{
+	unsigned int idx = gfn & (_PAGE_ENTRIES - 1);
+	long rc = 0;
+
+	for ( ; gfn < end; idx++, gfn++) {
+		if (pte_hole(READ_ONCE(table->ptes[idx]))) {
+			if (!(w->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(w->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+
+static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
+				 struct dat_walk *walk)
+{
+	unsigned long idx, cur_shift, cur_size;
+	dat_walk_op the_op;
+	union crste crste;
+	gfn_t cur, next;
+	long rc = 0;
+
+	cur_shift = 8 + table->crstes[0].h.tt * 11;
+	idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
+	cur_size = 1UL << cur_shift;
+
+	for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) {
+		next = cur + cur_size;
+		walk->last = table->crstes + idx;
+		crste = READ_ONCE(*walk->last);
+
+		if (crste_hole(crste)) {
+			if (!(walk->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(walk->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		the_op = walk->ops->crste_ops[crste.h.tt];
+		if (the_op) {
+			rc = the_op(walk->last, cur, next, walk);
+			crste = READ_ONCE(*walk->last);
+		}
+		if (rc)
+			break;
+		if (!crste.h.i && !crste.h.fc) {
+			if (!is_pmd(crste))
+				rc = dat_crste_walk_range(max(start, cur), min(end, next),
+							  _dereference_crste(crste), walk);
+			else if (walk->ops->pte_entry)
+				rc = dat_pte_walk_range(max(start, cur), min(end, next),
+							dereference_pmd(crste.pmd), walk);
+		}
+	}
+	return rc;
+}
+
+/**
+ * _dat_walk_gfn_range() - Walk DAT tables.
+ * @start: The first guest page frame to walk.
+ * @end: The guest page frame immediately after the last one to walk.
+ * @asce: The ASCE of the guest mapping.
+ * @ops: The dat_walk_ops that will be used to perform the walk.
+ * @flags: Flags from DAT_WALK_* (currently only DAT_WALK_IGN_HOLES is supported).
+ * @priv: Will be passed as-is to the callbacks.
+ *
+ * Any callback returning non-zero causes the walk to stop immediately.
+ *
+ * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the
+ *	   given ASCE unless the DAT_WALK_IGN_HOLES flag is specified,
+ *	   otherwise it returns whatever the callbacks return.
+ */
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv)
+{
+	struct crst_table *table = dereference_asce(asce);
+	struct dat_walk walk = {
+		.ops = ops,
+		.asce = asce,
+		.priv = priv,
+		.flags = flags,
+		.start = start,
+		.end = end,
+	};
+
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, start))
+		return (flags & DAT_WALK_IGN_HOLES) ?
0 : -EFAULT; + + return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk); +} + +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int rc; + + skey->skey = 0; + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste; + + crste = READ_ONCE(*crstep); + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn)); + return 0; + } + pgste = pgste_get_lock(ptep); + if (ptep->h.i) { + skey->acc = pgste.acc; + skey->fp = pgste.fp; + } else { + skey->skey = page_get_storage_key(pte_origin(*ptep)); + } + skey->r |= pgste.gr; + skey->c |= pgste.gc; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep) +{ + if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc) + __atomic64_or(_PAGE_SD, &ptep->val); +} + +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq); + return 0; + } + + old = pgste_get_lock(ptep); + pgste = old; + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + union skey old_skey; + + old_skey.skey = page_get_storage_key(pte_origin(*ptep)); + pgste.hc |= old_skey.c; + pgste.hr |= old_skey.r; + old_skey.c = old.gc; + old_skey.r = old.gr; + skey.r = 0; + skey.c = 0; + page_set_storage_key(pte_origin(*ptep), skey.skey, !nq); + } + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey, + bool nq, bool mr, bool mc) +{ + oldkey->skey = page_get_storage_key(paddr); + if (oldkey->acc == skey.acc && oldkey->fp == skey.fp && + (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc)) + return false; + page_set_storage_key(paddr, skey.skey, !nq); + return true; +} + +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc) +{ + union pgste pgste, old; + union crste *crstep; + union skey prev; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) + return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey, + nq, mr, mc); + + old = pgste_get_lock(ptep); + pgste = old; + + rc = 1; + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc); + pgste.hc |= prev.c; + pgste.hr |= prev.r; + prev.c |= old.gc; + prev.r |= old.gr; + } else { + prev.acc = old.acc; + prev.fp = old.fp; + prev.c = old.gc; + prev.r = old.gr; + } + if (oldkey) + *oldkey = prev; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +int dat_reset_reference_bit(union asce asce, gfn_t gfn) +{ + union pgste pgste, old; + union crste 
*crstep;
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	if (rc)
+		return rc;
+
+	if (!ptep) {
+		union crste crste = READ_ONCE(*crstep);
+
+		if (!crste.h.fc || !crste.s.fc1.pr)
+			return 0;
+		return page_reset_referenced(large_crste_to_phys(*crstep, gfn));
+	}
+	old = pgste_get_lock(ptep);
+	pgste = old;
+
+	if (!ptep->h.i) {
+		rc = page_reset_referenced(pte_origin(*ptep));
+		pgste.hr = rc >> 1;
+	}
+	rc |= (pgste.gr << 1) | pgste.gc;
+	pgste.gr = 0;
+
+	dat_update_ptep_sd(old, pgste, ptep);
+	pgste_set_unlock(ptep, pgste);
+	return rc;
+}
+
+static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.acc = 0;
+	pgste.fp = 0;
+	pgste.gr = 0;
+	pgste.gc = 0;
+	if (ptep->s.pr)
+		page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1);
+	pgste_set_unlock(ptep, pgste);
+
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	phys_addr_t addr, end, origin = crste_origin_large(*crstep);
+
+	if (!crstep->h.fc || !crstep->s.fc1.pr)
+		return 0;
+
+	addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
+	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
+	while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end)
+		addr = sske_frame(addr, PAGE_DEFAULT_KEY);
+	for ( ; addr < end; addr += PAGE_SIZE)
+		page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1);
+
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+long dat_reset_skeys(union asce asce, gfn_t start)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = dat_reset_skeys_pte,
+		.pmd_entry = dat_reset_skeys_crste,
+		.pud_entry = dat_reset_skeys_crste,
+	};
+
+	return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct slot_priv {
+	unsigned long token;
+	struct kvm_s390_mmu_cache *mc;
+};
+
+static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct slot_priv *p = walk->priv;
+	union crste dummy = { .val = p->token };
+	union pte new_pte, pte = READ_ONCE(*ptep);
+
+	new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par);
+
+	/* Table entry already in the desired state. */
+	if (pte.val == new_pte.val)
+		return 0;
+
+	dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false);
+	return 0;
+}
+
+static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union crste new_crste, crste = READ_ONCE(*crstep);
+	struct slot_priv *p = walk->priv;
+
+	new_crste.val = p->token;
+	new_crste.h.tt = crste.h.tt;
+
+	/* Table entry already in the desired state. */
+	if (crste.val == new_crste.val)
+		return 0;
+
+	/* This table entry needs to be updated. */
+	if (walk->start <= gfn && walk->end >= next) {
+		if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+			return -EINVAL;
+		/* A lower level table was present, needs to be freed. */
+		if (!crste.h.fc && !crste.h.i) {
+			if (is_pmd(crste))
+				dat_free_pt(dereference_pmd(crste.pmd));
+			else
+				dat_free_level(dereference_crste(crste), true);
+		}
+		return 0;
+	}
+
+	/* A lower level table is present, things will be handled there. */
+	if (!crste.h.fc && !crste.h.i)
+		return 0;
+	/* Split (install a lower level table), and handle things there.
*/ + return dat_split_crste(p->mc, crstep, gfn, walk->asce, false); +} + +static const struct dat_walk_ops dat_slot_ops = { + .pte_entry = _dat_slot_pte, + .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, }, +}; + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param) +{ + struct slot_priv priv = { + .token = _CRSTE_TOK(0, type, param).val, + .mc = mc, + }; + + return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops, + DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv); +} + +static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgstes[i].pcl) + break; + pgste_set_unlock(first + i, pgstes[i]); + } +} + +static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgste_get_trylock(first + i, pgstes + i)) + break; + } + if (i == n) + return true; + pgste_set_unlock_multiple(first, n, pgstes); + return false; +} + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param) +{ + union pgste pgstes[4] = {}; + unsigned long res = 0; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = 0; i < n; i++) + res = res << 16 | pgstes[i].val16; + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); + return res; +} + +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val) +{ + union pgste pgstes[4] = {}; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = param.len; i >= 0; i--) { + pgstes[i].val16 = val; + val = val >> 16; + } + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); +} + +static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk) +{ + return ptep->s.y; +} + +static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end, + struct dat_walk *walk) +{ + return crstep->h.fc && crstep->s.fc1.y; +} + +static const struct dat_walk_ops test_age_ops = { + .pte_entry = _dat_test_young_pte, + .pmd_entry = _dat_test_young_crste, + .pud_entry = _dat_test_young_crste, +}; + +/** + * dat_test_age_gfn() - Test young. + * @asce: The ASCE whose address range is to be tested. + * @start: The first guest frame of the range to check. + * @end: The guest frame after the last in the range. + * + * Context: called by KVM common code with the kvm mmu write lock held. + * + * Return: %true if any page in the given range is young, otherwise %false. 
+ */ +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) +{ + return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; +} + +static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste newcrste, oldcrste; + int *n = walk->priv; + + do { + oldcrste = READ_ONCE(*crstep); + if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p) + return 0; + if (oldcrste.s.fc1.prefix_notif) + break; + newcrste = oldcrste; + newcrste.s.fc1.prefix_notif = 1; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce)); + *n = 2; + return 0; +} + +static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + int *n = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!ptep->h.i && !ptep->h.p) { + pgste.prefix_notif = 1; + *n += 1; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn) +{ + static const struct dat_walk_ops ops = { + .pte_entry = dat_set_pn_pte, + .pmd_entry = dat_set_pn_crste, + .pud_entry = dat_set_pn_crste, + }; + + int n = 0; + + _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n); + if (n != 2) + return -EAGAIN; + return 0; +} + +/** + * dat_perform_essa() - Perform ESSA actions on the PGSTE. + * @asce: The asce to operate on. + * @gfn: The guest page frame to operate on. + * @orc: The specific action to perform, see the ESSA_SET_* macros. + * @state: The storage attributes to be returned to the guest. + * @dirty: Returns whether the function dirtied a previously clean entry. + * + * Context: Called with kvm->mmu_lock held. + * + * Return: + * * %1 if the page state has been altered and the page is to be added to the CBRL + * * %0 if the page state has been altered, but the page is not to be added to the CBRL + * * %-1 if the page state has not been altered and the page is not to be added to the CBRL + */ +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int res = 0; + + if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) { + *state = (union essa_state) { .exception = 1 }; + return -1; + } + + pgste = pgste_get_lock(ptep); + + *state = (union essa_state) { + .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero), + .nodat = pgste.nodat, + .usage = pgste.usage, + }; + + switch (orc) { + case ESSA_GET_STATE: + res = -1; + break; + case ESSA_SET_STABLE: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 0; + break; + case ESSA_SET_UNUSED: + pgste.usage = PGSTE_GPS_USAGE_UNUSED; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + if (!ptep->h.i) { + pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE; + } else if (pgste.zero) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + } else if (!pgste.gc) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + res = 1; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. 
+ */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!ptep->h.i) + pgste.usage = PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_STABLE_NODAT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 1; + break; + default: + WARN_ONCE(1, "Invalid ORC!"); + res = -1; + break; + } + /* If we are discarding a page, set it to logical zero. */ + pgste.zero = res == 1; + if (orc > 0) { + *dirty = !pgste.cmma_d; + pgste.cmma_d = 1; + } + + pgste_set_unlock(ptep, pgste); + + return res; +} + +static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.usage = 0; + pgste.nodat = 0; + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + if (need_resched()) + return next; + return 0; +} + +long dat_reset_cmma(union asce asce, gfn_t start) +{ + const struct dat_walk_ops dat_reset_cmma_ops = { + .pte_entry = dat_reset_cmma_pte, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops, + DAT_WALK_IGN_HOLES, NULL); +} + +struct dat_get_cmma_state { + gfn_t start; + gfn_t end; + unsigned int count; + u8 *values; + atomic64_t *remaining; +}; + +static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6); + pgste_set_unlock(ptep, pgste); + state->end = next; + + return 0; +} + +static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + + if (crstep->h.i) + state->end = min(walk->end, next); + return 0; +} + +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values) +{ + const struct dat_walk_ops ops = { + .pte_entry = __dat_peek_cmma_pte, + .pmd_entry = __dat_peek_cmma_crste, + .pud_entry = __dat_peek_cmma_crste, + .p4d_entry = __dat_peek_cmma_crste, + .pgd_entry = __dat_peek_cmma_crste, + }; + struct dat_get_cmma_state state = { .values = values, }; + int rc; + + rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state); + *count = state.end - start; + /* Return success if at least one value was saved, otherwise an error. */ + return (rc == -EFAULT && *count > 0) ? 
0 : rc;
+}
+
+static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+	union pgste pgste;
+
+	if (state->start != -1) {
+		if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
+			return 1;
+		if (gfn - state->start >= state->count)
+			return 1;
+	}
+
+	if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
+		return 0;
+
+	pgste = pgste_get_lock(ptep);
+	if (pgste.cmma_d) {
+		if (state->start == -1)
+			state->start = gfn;
+		pgste.cmma_d = 0;
+		atomic64_dec(state->remaining);
+		state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6;
+		state->end = next;
+	}
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
+	struct dat_get_cmma_state state = {
+		.remaining = rem,
+		.values = values,
+		.count = *count,
+		.start = -1,
+	};
+
+	_dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);
+
+	if (state.start == -1) {
+		*count = 0;
+	} else {
+		*count = state.end - state.start;
+		*start = state.start;
+	}
+
+	return 0;
+}
+
+struct dat_set_cmma_state {
+	unsigned long mask;
+	const u8 *bits;
+};
+
+static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_set_cmma_state *state = walk->priv;
+	union pgste pgste, tmp;
+
+	tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.usage = tmp.usage;
+	pgste.nodat = tmp.nodat;
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+/**
+ * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages.
+ * @mc: Cache used for allocations.
+ * @asce: The ASCE of the guest.
+ * @gfn: The guest frame of the first page whose CMMA bits are to be set.
+ * @count: How many pages need to be processed.
+ * @mask: Which PGSTE bits should be set.
+ * @bits: Points to an array with the CMMA attributes.
+ *
+ * This function sets the CMMA attributes for the given pages. If the input
+ * buffer has zero length, no action is taken, otherwise the attributes are
+ * set and the mm->context.uses_cmm flag is set.
+ *
+ * Each byte in @bits contains new values for bits 32-39 of the PGSTE.
+ * Currently, only the fields NT and US are applied.
+ *
+ * Return: %0 in case of success, a negative error value otherwise.
+ */
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+		      unsigned long count, unsigned long mask, const uint8_t *bits)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
+	struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
+	union crste *crstep;
+	union pte *ptep;
+	gfn_t cur;
+	int rc;
+
+	for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) {
+		rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
+				    &crstep, &ptep);
+		if (rc)
+			return rc;
+	}
+	return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
new file mode 100644
index 000000000000..8f8278c44879
--- /dev/null
+++ b/arch/s390/kvm/dat.h
@@ -0,0 +1,976 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KVM guest address space mapping code
+ *
+ * Copyright IBM Corp.
2024, 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_DAT_H +#define __KVM_S390_DAT_H + +#include <linux/radix-tree.h> +#include <linux/refcount.h> +#include <linux/io.h> +#include <linux/kvm_types.h> +#include <linux/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/dat-bits.h> + +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* For consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +#define _ASCE(x) ((union asce) { .val = (x), }) +#define NULL_ASCE _ASCE(0) + +enum { + _DAT_TOKEN_NONE = 0, + _DAT_TOKEN_PIC, +}; + +#define _CRSTE_TOK(l, t, p) ((union crste) { \ + .tok.i = 1, \ + .tok.tt = (l), \ + .tok.type = (t), \ + .tok.par = (p) \ + }) +#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p) + +#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING) +#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0) + +#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT) + +#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) }) +#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0) + +/* This fake table type is used for page table walks (both for normal page tables and vSIE) */ +#define TABLE_TYPE_PAGE_TABLE -1 + +enum dat_walk_flags { + DAT_WALK_USES_SKEYS = 0x40, + DAT_WALK_CONTINUE = 0x20, + DAT_WALK_IGN_HOLES = 0x10, + DAT_WALK_SPLIT = 0x08, + DAT_WALK_ALLOC = 0x04, + DAT_WALK_ANY = 0x02, + DAT_WALK_LEAF = 0x01, + DAT_WALK_DEFAULT = 0 +}; + +#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC) +#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC) +#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC) + +union pte { + unsigned long val; + union page_table_entry h; + struct { + unsigned long :56; /* Hardware bits */ + unsigned long u : 1; /* Page unused */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long sd: 1; /* Soft dirty */ + unsigned long pr: 1; /* Present */ + } s; + struct { + unsigned char hwbytes[7]; + unsigned char swbyte; + }; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :20; + unsigned long : 1; /* Must be 0 */ + unsigned long i : 1; /* Must be 1 */ + unsigned long : 2; + unsigned long : 7; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; +}; + +#define _SEGMENT_FR_MASK (_SEGMENT_MASK >> PAGE_SHIFT) +#define _REGION3_FR_MASK (_REGION3_MASK >> PAGE_SHIFT) +#define _PAGES_PER_SEGMENT _PAGE_ENTRIES +#define _PAGES_PER_REGION3 (_PAGES_PER_SEGMENT * _CRST_ENTRIES) + +/* Soft dirty, needed as macro for atomic operations on ptes */ +#define _PAGE_SD 0x002 + +/* Needed as macro to perform atomic operations */ +#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */ +#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */ + +enum pgste_gps_usage { + PGSTE_GPS_USAGE_STABLE = 0, + PGSTE_GPS_USAGE_UNUSED, + PGSTE_GPS_USAGE_POT_VOLATILE, + PGSTE_GPS_USAGE_VOLATILE, +}; + +union pgste { + unsigned long val; + struct { + unsigned long acc : 4; + unsigned long fp : 1; + unsigned long : 3; + unsigned long pcl : 1; + unsigned long hr : 1; + 
unsigned long hc : 1; + unsigned long : 2; + unsigned long gr : 1; + unsigned long gc : 1; + unsigned long : 1; + unsigned long :16; /* val16 */ + unsigned long zero : 1; + unsigned long nodat : 1; + unsigned long : 4; + unsigned long usage : 2; + unsigned long : 8; + unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 5; + unsigned long : 8; + }; + struct { + unsigned short hwbytes0; + unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */ + unsigned short hwbytes4; + unsigned char flags; /* Maps to the software bits */ + unsigned char hwbyte7; + } __packed; +}; + +union pmd { + unsigned long val; + union segment_table_entry h; + struct { + struct { + unsigned long :44; /* HW */ + unsigned long : 3; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union pud { + unsigned long val; + union region3_table_entry h; + struct { + struct { + unsigned long :33; /* HW */ + unsigned long :14; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union p4d { + unsigned long val; + union region2_table_entry h; +}; + +union pgd { + unsigned long val; + union region1_table_entry h; +}; + +union crste { + unsigned long val; + union { + struct { + unsigned long :52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long : 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long : 2; + }; + struct { + unsigned long to:52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long tf: 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long tl: 2; + } fc0; + struct { + unsigned long :47; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + } fc1; + } h; + struct { + struct { + unsigned long :47; + unsigned long : 1; /* HW (should be 0) */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ 
+ unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :26; + unsigned long i : 1; /* Must be 1 */ + unsigned long : 1; + unsigned long tt : 2; + unsigned long : 1; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; + union pmd pmd; + union pud pud; + union p4d p4d; + union pgd pgd; +}; + +union skey { + unsigned char skey; + struct { + unsigned char acc :4; + unsigned char fp :1; + unsigned char r :1; + unsigned char c :1; + unsigned char zero:1; + }; +}; + +static_assert(sizeof(union pgste) == sizeof(unsigned long)); +static_assert(sizeof(union pte) == sizeof(unsigned long)); +static_assert(sizeof(union pmd) == sizeof(unsigned long)); +static_assert(sizeof(union pud) == sizeof(unsigned long)); +static_assert(sizeof(union p4d) == sizeof(unsigned long)); +static_assert(sizeof(union pgd) == sizeof(unsigned long)); +static_assert(sizeof(union crste) == sizeof(unsigned long)); +static_assert(sizeof(union skey) == sizeof(char)); + +struct segment_table { + union pmd pmds[_CRST_ENTRIES]; +}; + +struct region3_table { + union pud puds[_CRST_ENTRIES]; +}; + +struct region2_table { + union p4d p4ds[_CRST_ENTRIES]; +}; + +struct region1_table { + union pgd pgds[_CRST_ENTRIES]; +}; + +struct crst_table { + union { + union crste crstes[_CRST_ENTRIES]; + struct segment_table segment; + struct region3_table region3; + struct region2_table region2; + struct region1_table region1; + }; +}; + +struct page_table { + union pte ptes[_PAGE_ENTRIES]; + union pgste pgstes[_PAGE_ENTRIES]; +}; + +static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE); +static_assert(sizeof(struct page_table) == PAGE_SIZE); + +struct dat_walk; + +typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w); + +struct dat_walk_ops { + union { + dat_walk_op crste_ops[4]; + struct { + dat_walk_op pmd_entry; + dat_walk_op pud_entry; + dat_walk_op p4d_entry; + dat_walk_op pgd_entry; + }; + }; + long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w); +}; + +struct dat_walk { + const struct dat_walk_ops *ops; + union crste *last; + union pte *last_pte; + union asce asce; + gfn_t start; + gfn_t end; + int flags; + void *priv; +}; + +struct ptval_param { + unsigned char offset : 6; + unsigned char len : 2; +}; + +/** + * _pte() - Useful constructor for union pte + * @pfn: the pfn this pte should point to. + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * @special: whether the pte should be marked as special + * + * The pte is also marked as young and present. If the pte is marked as dirty, + * it gets marked as soft-dirty too. If the pte is not dirty, the hardware + * protect bit is set (independently of the write softbit); this way proper + * dirty tracking can be performed. + * + * Return: a union pte value. 
+ */ +static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special) +{ + union pte res = { .val = PFN_PHYS(pfn) }; + + res.h.p = !dirty; + res.s.y = 1; + res.s.pr = 1; + res.s.w = writable; + res.s.d = dirty; + res.s.sd = dirty; + res.s.s = special; + return res; +} + +static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt) +{ + union crste res = { .val = PFN_PHYS(pfn) }; + + res.h.tt = tt; + res.h.fc0.tl = _REGION_ENTRY_LENGTH; + res.h.fc0.tf = 0; + return res; +} + +/** + * _crste() - Useful constructor for union crste with FC=1 + * @pfn: the pfn this pte should point to. + * @tt: the table type + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * + * The crste is also marked as young and present. If the crste is marked as + * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the + * hardware protect bit is set (independently of the write softbit); this way + * proper dirty tracking can be performed. + * + * Return: a union crste value. + */ +static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty) +{ + union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK }; + + res.h.tt = tt; + res.h.p = !dirty; + res.h.fc = 1; + res.s.fc1.y = 1; + res.s.fc1.pr = 1; + res.s.fc1.w = writable; + res.s.fc1.d = dirty; + res.s.fc1.sd = dirty; + return res; +} + +union essa_state { + unsigned char val; + struct { + unsigned char : 2; + unsigned char nodat : 1; + unsigned char exception : 1; + unsigned char usage : 2; + unsigned char content : 2; + }; +}; + +/** + * struct vsie_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @r_gfn: virtual rmap address in the shadow guest address space + */ +struct vsie_rmap { + struct vsie_rmap *next; + union { + unsigned long val; + struct { + long level: 8; + unsigned long : 4; + unsigned long r_gfn:52; + }; + }; +}; + +static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long)); + +#define KVM_S390_MMU_CACHE_N_CRSTS 6 +#define KVM_S390_MMU_CACHE_N_PTS 2 +#define KVM_S390_MMU_CACHE_N_RMAPS 16 +struct kvm_s390_mmu_cache { + void *crsts[KVM_S390_MMU_CACHE_N_CRSTS]; + void *pts[KVM_S390_MMU_CACHE_N_PTS]; + void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS]; + short int n_crsts; + short int n_pts; + short int n_rmaps; +}; + +struct guest_fault { + gfn_t gfn; /* Guest frame */ + kvm_pfn_t pfn; /* Host PFN */ + struct page *page; /* Host page */ + union pte *ptep; /* Used to resolve the fault, or NULL */ + union crste *crstep; /* Used to resolve the fault, or NULL */ + bool writable; /* Mapping is writable */ + bool write_attempt; /* Write access attempted */ + bool attempt_pfault; /* Attempt a pfault first */ + bool valid; /* This entry contains valid data */ + void (*callback)(struct guest_fault *f); + void *priv; +}; + +/* + * 0 1 2 3 4 5 6 7 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | | PGT_ADDR | + * 8 | VMADDR | | + * 16 | | + * 24 | | + */ +#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1}) +#define PTVAL_PGT_ADDR MKPTVAL(4, 8) +#define PTVAL_VMADDR MKPTVAL(8, 6) + +union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, + gfn_t gfn, union asce asce, bool uses_skeys); +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce); +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce); + +long _dat_walk_gfn_range(gfn_t 
start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv); + +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp); +void dat_free_level(struct crst_table *table, bool owns_ptes); +struct crst_table *dat_alloc_crst_sleepable(unsigned long init); +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype); +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey); +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq); +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc); +int dat_reset_reference_bit(union asce asce, gfn_t gfn); +long dat_reset_skeys(union asce asce, gfn_t start); + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param); +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val); + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param); +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); + +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); +long dat_reset_cmma(union asce asce, gfn_t start_gfn); +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values); +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem); +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits); + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); + +#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) + +static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_pts) + return mc->pts[--mc->n_pts]; + return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_crsts) + return mc->crsts[--mc->n_crsts]; + return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER); +} + +static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_rmaps) + return mc->rmaps[--mc->n_rmaps]; + return kzalloc_obj(struct vsie_rmap, GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *crste_table_start(union crste *crstep) +{ + return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE); +} + +static inline struct page_table *pte_table_start(union pte *ptep) +{ + return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE); +} + +static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + unsigned long dtt = 0x10 | new.h.tt << 2; + void *table = crste_table_start(crstep); + + return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val); +} + +/** + * idte_crste() - invalidate a crste entry using idte + * @crstep: pointer to the crste to be invalidated + * @gfn: a gfn mapped by the crste + * @opt: options for the idte instruction + * @asce: the asce + * @local: whether the operation is cpu-local + */ +static __always_inline void idte_crste(union crste 
*crstep, gfn_t gfn, unsigned long opt, + union asce asce, int local) +{ + unsigned long table_origin = __pa(crste_table_start(crstep)); + unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK; + + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile("idte %[table_origin],0,%[gaddr],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr), + [local] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt), + [asce] "a" (asce.val), [local] "i" (local) + : "cc"); + } +} + +static inline void dat_init_pgstes(struct page_table *pt, unsigned long val) +{ + memset64((void *)pt->pgstes, val, PTRS_PER_PTE); +} + +static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes, + unsigned long pgstes) +{ + memset64((void *)pt->ptes, ptes, PTRS_PER_PTE); + dat_init_pgstes(pt, pgstes); +} + +static inline gfn_t asce_end(union asce asce) +{ + return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT); +} + +#define _CRSTE(x) ((union crste) { .val = _Generic((x), \ + union pgd : (x).val, \ + union p4d : (x).val, \ + union pud : (x).val, \ + union pmd : (x).val, \ + union crste : (x).val)}) + +#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \ + union pgd : (x), \ + union p4d : (x), \ + union pud : (x), \ + union pmd : (x), \ + union crste : (x))) + +#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \ + struct crst_table : (x), \ + struct segment_table : (x), \ + struct region3_table : (x), \ + struct region2_table : (x), \ + struct region1_table : (x))) + +static inline bool asce_contains_gfn(union asce asce, gfn_t gfn) +{ + return gfn < asce_end(asce); +} + +static inline bool is_pmd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_SEGMENT; +} + +static inline bool is_pud(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION3; +} + +static inline bool is_p4d(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION2; +} + +static inline bool is_pgd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION1; +} + +static inline phys_addr_t pmd_origin_large(union pmd pmd) +{ + return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE; +} + +static inline phys_addr_t pud_origin_large(union pud pud) +{ + return pud.val & _REGION3_ENTRY_ORIGIN_LARGE; +} + +/** + * crste_origin_large() - Return the large frame origin of a large crste + * @crste: The crste whose origin is to be returned. Should be either a + * region-3 table entry or a segment table entry, in both cases with + * FC set to 1 (large pages). 
+ * + * Return: The origin of the large frame pointed to by @crste, or -1 if the + * crste was not large (wrong table type, or FC==0) + */ +static inline phys_addr_t crste_origin_large(union crste crste) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return pmd_origin_large(crste.pmd); + return pud_origin_large(crste.pud); +} + +#define crste_origin(x) (_Generic((x), \ + union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \ + union pud : (x).val & _REGION_ENTRY_ORIGIN, \ + union p4d : (x).val & _REGION_ENTRY_ORIGIN, \ + union pgd : (x).val & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pte_origin(union pte pte) +{ + return pte.val & PAGE_MASK; +} + +static inline bool pmd_prefix(union pmd pmd) +{ + return pmd.h.fc && pmd.s.fc1.prefix_notif; +} + +static inline bool pud_prefix(union pud pud) +{ + return pud.h.fc && pud.s.fc1.prefix_notif; +} + +static inline bool crste_leaf(union crste crste) +{ + return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc; +} + +static inline bool crste_prefix(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.prefix_notif; +} + +static inline bool crste_dirty(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.d; +} + +static inline union pgste *pgste_of(union pte *pte) +{ + return (union pgste *)(pte + _PAGE_ENTRIES); +} + +static inline bool pte_hole(union pte pte) +{ + return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE; +} + +static inline bool _crste_hole(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE; +} + +#define crste_hole(x) _crste_hole(_CRSTE(x)) + +static inline bool _crste_none(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE; +} + +#define crste_none(x) _crste_none(_CRSTE(x)) + +static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn) +{ + return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK); +} + +static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn) +{ + return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK); +} + +static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return large_pmd_to_phys(crste.pmd, gfn); + return large_pud_to_phys(crste.pud, gfn); +} + +static inline bool cspg_crste(union crste *crstep, union crste old, union crste new) +{ + return cspg(&crstep->val, old.val, new.val); +} + +static inline struct page_table *dereference_pmd(union pmd pmd) +{ + return phys_to_virt(crste_origin(pmd)); +} + +static inline struct segment_table *dereference_pud(union pud pud) +{ + return phys_to_virt(crste_origin(pud)); +} + +static inline struct region3_table *dereference_p4d(union p4d p4d) +{ + return phys_to_virt(crste_origin(p4d)); +} + +static inline struct region2_table *dereference_pgd(union pgd pgd) +{ + return phys_to_virt(crste_origin(pgd)); +} + +static inline struct crst_table *_dereference_crste(union crste crste) +{ + if (unlikely(is_pmd(crste))) + return NULL; + return phys_to_virt(crste_origin(crste.pud)); +} + +#define dereference_crste(x) (_Generic((x), \ + union pud : _dereference_crste(_CRSTE(x)), \ + union p4d : _dereference_crste(_CRSTE(x)), \ + union pgd : _dereference_crste(_CRSTE(x)), \ + union crste : _dereference_crste(_CRSTE(x)))) + +static inline struct crst_table *dereference_asce(union asce asce) +{ + return phys_to_virt(asce.val & _ASCE_ORIGIN); +} + 
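The _CRSTE()/_CRSTEP()/_CRSTP() macros and dereference_crste() above rely on C11 _Generic selection so that one macro can accept any of the typed table-entry unions and normalize them to the generic crste view at compile time, with no runtime cost. The following stand-alone sketch illustrates the same idiom; the stand-in types and names here are invented for the example and are not kernel code.

#include <stdio.h>

/* Invented stand-in wrapper types, mirroring union pmd/pud/crste above. */
union seg  { unsigned long val; };
union reg3 { unsigned long val; };
union any  { unsigned long val; };

/* Fold any wrapper into the common view, like _CRSTE() does for crstes. */
#define AS_ANY(x) ((union any){ .val = _Generic((x), \
		union seg  : (x).val,               \
		union reg3 : (x).val,               \
		union any  : (x).val) })

static void dump_entry(union any e)
{
	printf("entry: %#lx\n", e.val);
}

int main(void)
{
	union seg s = { .val = 0x1000 };
	union reg3 r = { .val = 0x2000 };

	/* One macro, several static types; selection happens at compile time. */
	dump_entry(AS_ANY(s));
	dump_entry(AS_ANY(r));
	return 0;
}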
+static inline void asce_flush_tlb(union asce asce) +{ + __tlb_flush_idte(asce.val); +} + +static inline bool pgste_get_trylock(union pte *ptep, union pgste *res) +{ + union pgste *pgstep = pgste_of(ptep); + union pgste old_pgste; + + if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT) + return false; + old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val); + if (old_pgste.pcl) + return false; + old_pgste.pcl = 1; + *res = old_pgste; + return true; +} + +static inline union pgste pgste_get_lock(union pte *ptep) +{ + union pgste res; + + while (!pgste_get_trylock(ptep, &res)) + cpu_relax(); + return res; +} + +static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) +{ + pgste.pcl = 0; + barrier(); + WRITE_ONCE(*pgste_of(ptep), pgste); +} + +static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce, + bool has_skeys) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys); + pgste_set_unlock(ptep, pgste); +} + +static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys) +{ + dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys); +} + +static inline void dat_free_pt(struct page_table *pt) +{ + free_page((unsigned long)pt); +} + +static inline void _dat_free_crst(struct crst_table *table) +{ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +#define dat_free_crst(x) _dat_free_crst(_CRSTP(x)) + +static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc) +{ + if (!mc) + return; + while (mc->n_pts) + dat_free_pt(mc->pts[--mc->n_pts]); + while (mc->n_crsts) + _dat_free_crst(mc->crsts[--mc->n_crsts]); + while (mc->n_rmaps) + kfree(mc->rmaps[--mc->n_rmaps]); + kfree(mc); +} + +DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T)) + +static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void) +{ + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + + mc = kzalloc_obj(*mc, GFP_KERNEL_ACCOUNT); + if (mc && !kvm_s390_mmu_cache_topup(mc)) + return_ptr(mc); + return NULL; +} + +static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce) +{ + union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt); + + do { + oldcrste = READ_ONCE(*crstep); + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce)); + return oldcrste; +} + +static inline int get_level(union crste *crstep, union pte *ptep) +{ + return ptep ? 
TABLE_TYPE_PAGE_TABLE : crstep->h.tt; +} + +static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING); +} + +static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0); +} + +static inline bool crste_is_ucas(union crste crste) +{ + return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0; +} + +#endif /* __KVM_S390_DAT_H */ diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 74f73141f9b9..d89d1c381522 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -10,12 +10,30 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +#include "gmap.h" + +static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslot_iter iter; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned long start, end; + + slots = kvm_vcpu_memslots(vcpu); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + slot = iter.slot; + start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn)); + end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages)); + gmap_helper_discard(vcpu->kvm->mm, start, end); + } +} static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); + mmap_read_lock(vcpu->kvm->mm); /* * We checked for start >= end above, so lets check for the * fast path (no prefix swap page involved) */ if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { - gmap_discard(vcpu->arch.gmap, start, end); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end)); } else { /* * This is slow path. gmap_discard will check for start @@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) * prefix and let gmap_discard make some of these calls * NOPs. */ - gmap_discard(vcpu->arch.gmap, start, prefix); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix)); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + do_discard_gfn_range(vcpu, 0, 1); if (end > prefix + PAGE_SIZE) - gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); - gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); + do_discard_gfn_range(vcpu, 1, 2); + do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end)); } + mmap_read_unlock(vcpu->kvm->mm); return 0; } diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c new file mode 100644 index 000000000000..ddf0ca71f374 --- /dev/null +++ b/arch/s390/kvm/faultin.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> + +#include "gmap.h" +#include "trace.h" +#include "faultin.h" + +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); + +/* + * kvm_s390_faultin_gfn() - handle a dat fault. + * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM. 
+ * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU. + * @f: The guest fault that needs to be resolved. + * + * Return: + * * 0 on success + * * < 0 in case of error + * * > 0 in case of guest exceptions + * + * Context: + * * The mm lock must not be held when calling + * * kvm->srcu must be held + * * may sleep + */ +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f) +{ + struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL; + struct kvm_s390_mmu_cache *mc = NULL; + struct kvm_memory_slot *slot; + unsigned long inv_seq; + int foll, rc = 0; + + foll = f->write_attempt ? FOLL_WRITE : 0; + foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; + + if (vcpu) { + kvm = vcpu->kvm; + mc = vcpu->arch.mc; + } + + lockdep_assert_held(&kvm->srcu); + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0) + return 0; + } + + while (1) { + f->valid = false; + inv_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + if (vcpu) + slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn); + else + slot = gfn_to_memslot(kvm, f->gfn); + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + + /* Needs I/O, try to set up an async pfault (only possible with FOLL_NOWAIT). */ + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) { + if (unlikely(!f->attempt_pfault)) + return -EAGAIN; + if (unlikely(!vcpu)) + return -EINVAL; + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not set up an async pfault, try again synchronously. */ + foll &= ~FOLL_NOWAIT; + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + } + + /* Access outside memory, addressing exception. */ + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + /* Signal pending: try again. */ + if (f->pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + /* Check if it's read-only memory; don't try to actually handle that case. */ + if (f->pfn == KVM_PFN_ERR_RO_FAULT) + return -EOPNOTSUPP; + /* Any other error. */ + if (is_error_pfn(f->pfn)) + return -EFAULT; + + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + /* Loop, will automatically release the faulted page. */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { + kvm_release_faultin_page(kvm, f->page, true, false); + continue; + } + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { + f->valid = true; + rc = gmap_link(mc, kvm->arch.gmap, f, slot); + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); + f->page = NULL; + } + } + kvm_release_faultin_page(kvm, f->page, true, false); + + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } else if (rc != -EAGAIN) { + return rc; + } + } +} + +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int foll = w ? FOLL_WRITE : 0; + + f->write_attempt = w; + f->gfn = gfn; + f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page); + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + if (is_sigpending_pfn(f->pfn)) + return -EINTR; + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) + return -EAGAIN; + if (is_error_pfn(f->pfn)) + return -EFAULT; + + f->valid = true; + return 0; +}
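For context, this is roughly how a caller might drive kvm_s390_faultin_gfn() for a guest write fault; a minimal sketch, assuming the guest_fault fields shown above, the existing kvm_s390_inject_program_int() helper, and kvm->srcu held as the kernel-doc requires. The wrapper name handle_guest_write_fault() is invented for illustration, not part of the patch.

/* Hypothetical caller sketch, not part of this series. */
static int handle_guest_write_fault(struct kvm_vcpu *vcpu, gpa_t gaddr)
{
	struct guest_fault f = {
		.gfn = gpa_to_gfn(gaddr),
		.write_attempt = true,
		.attempt_pfault = true,	/* allow async pfault via FOLL_NOWAIT */
	};
	int idx, rc;

	idx = srcu_read_lock(&vcpu->kvm->srcu);
	rc = kvm_s390_faultin_gfn(vcpu, NULL, &f);
	srcu_read_unlock(&vcpu->kvm->srcu, idx);

	/* rc > 0 is a program interruption code to inject into the guest. */
	if (rc > 0)
		return kvm_s390_inject_program_int(vcpu, rc);
	/* 0 on success; < 0 (e.g. -EAGAIN) is a host-side condition. */
	return rc;
}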
diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h new file mode 100644 index 000000000000..f86176d2769c --- /dev/null +++ b/arch/s390/kvm/faultin.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_FAULTIN_H +#define __KVM_S390_FAULTIN_H + +#include <linux/kvm_host.h> + +#include "dat.h" + +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f); +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w); + +static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm, + gfn_t gfn, bool wr) +{ + struct guest_fault f = { .gfn = gfn, .write_attempt = wr, }; + + return kvm_s390_faultin_gfn(vcpu, kvm, &f); +} + +static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f, + gpa_t gaddr, unsigned long *val) +{ + int rc; + + rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false); + if (rc) + return rc; + + *val = *(unsigned long *)phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr)); + + return 0; +} + +static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults, + int n, bool ignore) +{ + int i; + + for (i = 0; i < n; i++) { + kvm_release_faultin_page(kvm, guest_faults[i].page, ignore, + guest_faults[i].write_attempt); + guest_faults[i].page = NULL; + } +} + +static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq, + struct guest_fault *guest_faults, int n, + bool unsafe) +{ + int i; + + for (i = 0; i < n; i++) { + if (!guest_faults[i].valid) + continue; + if (unsafe && mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn)) + return true; + if (!unsafe && mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn)) + return true; + } + return false; +} + +static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults, + gfn_t start, int n_pages, bool write_attempt) +{ + int i, rc = 0; + + for (i = 0; i < n_pages; i++) { + rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt); + if (rc) + break; + } + return rc; +} + +#define kvm_s390_release_faultin_array(kvm, array, ignore) \ + kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore) + +#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true) + +#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false) + +#endif /* __KVM_S390_FAULTIN_H */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index f6fded15633a..b07accd19618 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -11,40 +11,43 @@ #include <linux/err.h> #include <linux/pgtable.h> #include <linux/bitfield.h> +#include <linux/kvm_host.h> +#include <linux/kvm_types.h> +#include <asm/diag.h> #include <asm/access-regs.h> #include <asm/fault.h> -#include <asm/gmap.h> #include <asm/dat-bits.h> #include 
"kvm-s390.h" +#include "dat.h" #include "gmap.h" #include "gaccess.h" +#include "faultin.h" -/* - * vaddress union in order to easily decode a virtual address into its - * region first index, region second index etc. parts. - */ -union vaddress { - unsigned long addr; - struct { - unsigned long rfx : 11; - unsigned long rsx : 11; - unsigned long rtx : 11; - unsigned long sx : 11; - unsigned long px : 8; - unsigned long bx : 12; - }; - struct { - unsigned long rfx01 : 2; - unsigned long : 9; - unsigned long rsx01 : 2; - unsigned long : 9; - unsigned long rtx01 : 2; - unsigned long : 9; - unsigned long sx01 : 2; - unsigned long : 29; - }; +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +union dat_table_entry { + unsigned long val; + union region1_table_entry pgd; + union region2_table_entry p4d; + union region3_table_entry pud; + union segment_table_entry pmd; + union page_table_entry pte; }; +#define WALK_N_ENTRIES 7 +#define LEVEL_MEM -2 +struct pgtwalk { + struct guest_fault raw_entries[WALK_N_ENTRIES]; + gpa_t last_addr; + int level; + bool p; +}; + +static inline struct guest_fault *get_entries(struct pgtwalk *w) +{ + return w->raw_entries - LEVEL_MEM; +} + /* * raddress union which will contain the result (real or absolute address) * after a page table walk. The rfaa, sfaa and pfra members are used to @@ -106,16 +109,33 @@ struct aste { /* .. more fields there */ }; +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + int ipte_lock_held(struct kvm *kvm) { - if (sclp.has_siif) { - int rc; + if (sclp.has_siif) + return kvm->arch.sca->ipte_control.kh != 0; - read_lock(&kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(kvm)->kh != 0; - read_unlock(&kvm->arch.sca_lock); - return rc; - } return kvm->arch.ipte_lock_count != 0; } @@ -128,19 +148,16 @@ static void ipte_lock_simple(struct kvm *kvm) if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.k) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); } @@ -153,14 +170,12 @@ static void ipte_unlock_simple(struct kvm *kvm) kvm->arch.ipte_lock_count--; if (kvm->arch.ipte_lock_count) goto out; - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -171,12 +186,10 @@ static void ipte_lock_siif(struct kvm *kvm) union ipte_control old, new, *ic; retry: - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { if (old.kg) { - read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -184,15 +197,13 @@ retry: new.k = 1; new.kh++; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static void ipte_unlock_siif(struct kvm *kvm) { union 
ipte_control old, new, *ic; - read_lock(&kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(kvm); + ic = &kvm->arch.sca->ipte_control; old = READ_ONCE(*ic); do { new = old; @@ -200,7 +211,6 @@ static void ipte_unlock_siif(struct kvm *kvm) if (!new.kh) new.k = 0; } while (!try_cmpxchg(&ic->val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); } @@ -318,7 +328,7 @@ enum prot_type { PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ - PROT_NONE, + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, @@ -334,7 +344,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { - case PROT_NONE: + case PROT_TYPE_DUMMY: /* We should never get here, acts like termination */ WARN_ON_ONCE(1); break; @@ -437,7 +447,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) } /** - * guest_translate - translate a guest virtual into a guest absolute address + * guest_translate_gva() - translate a guest virtual into a guest absolute address * @vcpu: virtual cpu * @gva: guest virtual address * @gpa: points to where guest physical (absolute) address should be stored @@ -457,9 +467,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * the returned value is the program interruption code as defined * by the architecture */ -static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, - unsigned long *gpa, const union asce asce, - enum gacc_mode mode, enum prot_type *prot) +static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa, const union asce asce, + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; @@ -640,31 +650,19 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int vm_check_access_key(struct kvm *kvm, u8 access_key, - enum gacc_mode mode, gpa_t gpa) +static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) { - u8 storage_key, access_control; - bool fetch_protected; - unsigned long hva; + union skey storage_key; int r; - if (access_key == 0) - return 0; - - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &kvm->mmu_lock) + r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); - if (access_control == access_key) + if (access_key == 0 || storage_key.acc == access_key) return 0; - fetch_protected = storage_key & _PAGE_FP_BIT; - if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp) return 0; return PGM_PROTECTION; } @@ -703,12 +701,11 @@ static bool storage_prot_override_applies(u8 access_control) return access_control == PAGE_SPO_ACC; } -static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, - enum gacc_mode mode, union asce asce, gpa_t gpa, - unsigned long ga, unsigned int len) +static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, 
gpa_t gpa, + unsigned long ga, unsigned int len) { - u8 storage_key, access_control; - unsigned long hva; + union skey storage_key; int r; /* access key 0 matches any storage key -> allow */ @@ -718,26 +715,23 @@ static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, * caller needs to ensure that gfn is accessible, so we can * assume that this cannot fail */ - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); /* access key matches storage key -> allow */ - if (access_control == access_key) + if (storage_key.acc == access_key) return 0; if (mode == GACC_FETCH || mode == GACC_IFETCH) { /* it is a fetch and fetch protection is off -> allow */ - if (!(storage_key & _PAGE_FP_BIT)) + if (!storage_key.fp) return 0; if (fetch_prot_override_applicable(vcpu, mode, asce) && fetch_prot_override_applies(ga, len)) return 0; } if (storage_prot_override_applicable(vcpu) && - storage_prot_override_applies(access_control)) + storage_prot_override_applies(storage_key.acc)) return 0; return PGM_PROTECTION; } @@ -797,20 +791,19 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); + rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { gpa = kvm_s390_real_to_abs(vcpu, ga); if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, - fragment_len); + rc = vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len); if (rc) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); if (gpas) @@ -822,8 +815,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return 0; } -static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len) +static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) { const unsigned int offset = offset_in_page(gpa); const gfn_t gfn = gpa_to_gfn(gpa); @@ -838,38 +831,79 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, return rc; } -static int -access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len, u8 access_key) +static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key) { - struct kvm_memory_slot *slot; - bool writable; - gfn_t gfn; - hva_t hva; - int rc; + union oac spec = { + .oac1.key = dst_key, + .oac1.k = !!dst_key, + .oac2.key = src_key, + .oac2.k = !!src_key, + }; + int exception = PGM_PROTECTION; + + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: lhi %[exc],0\n" + "2:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception) + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) + : "memory", "cc", "0"); + return exception; +} - gfn = gpa >> PAGE_SHIFT; - slot = gfn_to_memslot(kvm, 
gfn); - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); +struct acc_page_key_context { + void *data; + int exception; + unsigned short offset; + unsigned short len; + bool store; + u8 access_key; +}; - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a ro memslot, even tho that can't occur (they're unsupported). - * Don't try to actually handle that case. - */ - if (!writable && mode == GACC_STORE) - return -EOPNOTSUPP; - hva += offset_in_page(gpa); - if (mode == GACC_STORE) - rc = copy_to_user_key((void __user *)hva, data, len, access_key); +static void _access_guest_page_with_key_gpa(struct guest_fault *f) +{ + struct acc_page_key_context *context = f->priv; + void *ptr; + int r; + + ptr = __va(PFN_PHYS(f->pfn) | context->offset); + + if (context->store) + r = mvcos_key(ptr, context->data, context->len, context->access_key, 0); else - rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + r = mvcos_key(context->data, ptr, context->len, 0, context->access_key); + + context->exception = r; +} + +static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 acc) +{ + struct acc_page_key_context context = { + .offset = offset_in_page(gpa), + .len = len, + .data = data, + .access_key = acc, + .store = mode == GACC_STORE, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = mode == GACC_STORE, + .callback = _access_guest_page_with_key_gpa, + }; + int rc; + + if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm)) + return -EINVAL; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); if (rc) - return PGM_PROTECTION; - if (mode == GACC_STORE) - mark_page_dirty_in_slot(kvm, slot, gfn); - return 0; + return rc; + return context.exception; } int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, @@ -881,7 +915,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, while (min(PAGE_SIZE - offset, len) > 0) { fragment_len = min(PAGE_SIZE - offset, len); - rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key); if (rc) return rc; offset = 0; @@ -941,15 +975,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, for (idx = 0; idx < nr_pages; idx++) { fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { - rc = access_guest_page(vcpu->kvm, mode, gpas[idx], - data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len); } else { - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); } if (rc == PGM_PROTECTION && try_storage_prot_override) - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, PAGE_SPO_ACC); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); if (rc) break; len -= fragment_len; @@ -962,7 +995,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION) prot = PROT_TYPE_KEYC; else - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: @@ -983,7 +1016,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long 
gra, while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); - rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len); len -= fragment_len; gra += fragment_len; data += fragment_len; @@ -994,17 +1027,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } /** + * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys. + * @ptr: Address of value to compare to *@old and exchange with + * @new. Must be aligned to @size. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written here. + * @new: New value to place at *@ptr. + * @size: Size of the operation in bytes, may only be a power of two up to 16. + * @access_key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on guest memory, honoring storage key protection. + * @access_key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override applies. + * In case of an exception *@old is set to zero. + * + * Return: + * * %0: cmpxchg executed successfully + * * %1: cmpxchg executed unsuccessfully + * * %PGM_PROTECTION: an exception happened when trying to access *@ptr + * * %-EAGAIN: maxed out number of retries (byte and short only) + * * %-EINVAL: invalid value for @size + */ +static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old, + union kvm_s390_quad new, int size, u8 access_key) +{ + union kvm_s390_quad tmp = { .sixteen = 0 }; + int rc; + + /* + * The cmpxchg_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (size) { + case 1: + rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key); + break; + case 2: + rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key); + break; + case 4: + rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key); + break; + case 8: + rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key); + break; + case 16: + rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen, + access_key); + break; + default: + return -EINVAL; + } + if (!rc && memcmp(&tmp, old, size)) + rc = 1; + *old = tmp; + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (rc == -EFAULT) + rc = PGM_PROTECTION; + return rc; +} + +struct cmpxchg_key_context { + union kvm_s390_quad new; + union kvm_s390_quad *old; + int exception; + unsigned short offset; + u8 access_key; + u8 len; +}; + +static void _cmpxchg_guest_abs_with_key(struct guest_fault *f) +{ + struct cmpxchg_key_context *context = f->priv; + + context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset), + context->old, context->new, context->len, + context->access_key); +} + 
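_access_guest_page_with_key_gpa() and _cmpxchg_guest_abs_with_key() follow the same pattern: the operation's parameters live in a small context struct, f->priv points at it, and the callback is run against the pinned pfn while kvm_s390_faultin_gfn() resolves the fault. A hypothetical further user of that pattern could look like the sketch below; the zeroing operation and all names are invented for illustration (a real user would likely go through a key-checked primitive such as mvcos_key() rather than memset()).

/* Invented example of the guest_fault priv/callback pattern. */
struct zero_bytes_context {
	unsigned short offset;
	unsigned short len;
};

static void _zero_guest_bytes(struct guest_fault *f)
{
	struct zero_bytes_context *c = f->priv;

	/* Runs once the pfn is pinned; offset/len stay within one page. */
	memset(__va(PFN_PHYS(f->pfn) | c->offset), 0, c->len);
}

static int zero_guest_bytes(struct kvm *kvm, gpa_t gpa, unsigned short len)
{
	struct zero_bytes_context context = {
		.offset = offset_in_page(gpa),
		.len = len,
	};
	struct guest_fault fault = {
		.gfn = gpa_to_gfn(gpa),
		.priv = &context,
		.write_attempt = true,
		.callback = _zero_guest_bytes,
	};

	/* Must not cross a page boundary, like the accessors above. */
	if (offset_in_page(gpa) + len > PAGE_SIZE)
		return -EINVAL;
	return kvm_s390_faultin_gfn(NULL, kvm, &fault);
}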
+/** * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. * @kvm: Virtual machine instance. * @gpa: Absolute guest address of the location to be changed. * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a * non power of two will result in failure. - * @old_addr: Pointer to old value. If the location at @gpa contains this value, - * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() - * *@old_addr contains the value at @gpa before the attempt to - * exchange the value. + * @old: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old contains the value at @gpa before the attempt to + * exchange the value. * @new: The value to place at @gpa. - * @access_key: The access key to use for the guest access. + * @acc: The access key to use for the guest access. * @success: output value indicating if an exchange occurred. * * Atomically exchange the value at @gpa with @new, if it contains *@old. @@ -1017,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * * -EAGAIN: transient failure (len 1 or 2) * * -EOPNOTSUPP: read-only memslot (should never occur) */ -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, - __uint128_t *old_addr, __uint128_t new, - u8 access_key, bool *success) +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 acc, bool *success) { - gfn_t gfn = gpa_to_gfn(gpa); - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - bool writable; - hva_t hva; - int ret; - - if (!IS_ALIGNED(gpa, len)) - return -EINVAL; - - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a read-only memslot, even though that cannot occur - * since those are unsupported. - * Don't try to actually handle that case. - */ - if (!writable) - return -EOPNOTSUPP; - - hva += offset_in_page(gpa); - /* - * The cmpxchg_user_key macro depends on the type of "old", so we need - * a case for each valid length and get some code duplication as long - * as we don't introduce a new macro. - */ - switch (len) { - case 1: { - u8 old; - - ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 2: { - u16 old; - - ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 4: { - u32 old; - - ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 8: { - u64 old; + struct cmpxchg_key_context context = { + .old = old, + .new = new, + .offset = offset_in_page(gpa), + .len = len, + .access_key = acc, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = true, + .callback = _cmpxchg_guest_abs_with_key, + }; + int rc; - ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 16: { - __uint128_t old; + lockdep_assert_held(&kvm->srcu); - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - default: + if (len > 16 || !IS_ALIGNED(gpa, len)) return -EINVAL; - } - if (*success) - mark_page_dirty_in_slot(kvm, slot, gfn); - /* - * Assume that the fault is caused by protection, either key protection - * or user page write protection. - */ - if (ret == -EFAULT) - ret = PGM_PROTECTION; - return ret; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); + if (rc) + return rc; + *success = !context.exception; + if (context.exception == 1) + return 0; + return context.exception; }
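A sketch of how the reworked interface might be called, assuming kvm->srcu is held and a naturally aligned 4-byte operand; guest_counter_inc() is an invented name and the union field names follow __cmpxchg_with_key() above.

/* Hypothetical caller sketch for the union kvm_s390_quad interface. */
static int guest_counter_inc(struct kvm *kvm, gpa_t gpa, u32 expected)
{
	union kvm_s390_quad old = { .four = expected };
	union kvm_s390_quad new = { .four = expected + 1 };
	bool success = false;
	int rc;

	/* @gpa must be aligned to the operand length (4 bytes here). */
	rc = cmpxchg_guest_abs_with_key(kvm, gpa, sizeof(u32), &old, new, 0, &success);
	if (rc)
		return rc;	/* < 0 host error, > 0 PGM_* code for the guest */
	/* On a mismatch, old.four now holds the current guest value. */
	return success ? 0 : -EAGAIN;
}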
/** @@ -1174,7 +1238,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, while (length && !rc) { fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); - rc = vm_check_access_key(kvm, access_key, mode, gpa); + rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa); length -= fragment_len; gpa += fragment_len; } @@ -1201,304 +1265,409 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) } /** - * kvm_s390_shadow_tables - walk the guest page table and create shadow tables - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the beginning of the page table for the given address if - * successful (return value 0), or to the first invalid DAT entry in - * case of exceptions (return value > 0) - * @dat_protection: referenced memory is write protected - * @fake: pgt references contiguous guest memory block, not a pgtable + * walk_guest_tables() - Walk the guest page table and pin the DAT tables. + * @sg: Pointer to the shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @w: Will be filled with information on the pinned pages. + * @wr: Indicates a write access if true. + * + * Return: + * * %0 in case of success, + * * a PIC code > 0 in case the address translation fails + * * an error code < 0 if other errors happen in the host */ -static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr) { - struct kvm *kvm; - struct gmap *parent; - union asce asce; + struct gmap *parent = sg->parent; + struct guest_fault *entries; + union dat_table_entry table; union vaddress vaddr; unsigned long ptr; + struct kvm *kvm; + union asce asce; int rc; - *fake = 0; - *dat_protection = 0; - kvm = sg->private; - parent = sg->parent; + if (!parent) + return -EAGAIN; + kvm = parent->kvm; + WARN_ON(!kvm); + asce = sg->guest_asce; + entries = get_entries(w); + + w->level = LEVEL_MEM; + w->last_addr = saddr; + if (asce.r) + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false); + vaddr.addr = saddr; - asce.val = sg->orig_asce; ptr = asce.rsto * PAGE_SIZE; - if (asce.r) { - *fake = 1; - ptr = 0; - asce.dt = ASCE_TYPE_REGION1; - } + + if (!asce_contains_gfn(asce, gpa_to_gfn(saddr))) + return PGM_ASCE_TYPE; switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !*fake) + if (vaddr.rfx01 > asce.tl) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: - if (vaddr.rfx) - return PGM_ASCE_TYPE; if (vaddr.rsx01 > asce.tl) return PGM_REGION_SECOND_TRANS; break; case ASCE_TYPE_REGION3: - if (vaddr.rfx || vaddr.rsx) - return PGM_ASCE_TYPE; if (vaddr.rtx01 > asce.tl) return PGM_REGION_THIRD_TRANS; break; case ASCE_TYPE_SEGMENT: - if (vaddr.rfx || vaddr.rsx || vaddr.rtx) - return PGM_ASCE_TYPE; if (vaddr.sx01 > asce.tl) return PGM_SEGMENT_TRANSLATION; break; } + w->level = asce.dt; switch (asce.dt) { - case ASCE_TYPE_REGION1: { - union region1_table_entry rfte; - - if (*fake) { - ptr += vaddr.rfx * _REGION1_SIZE; - rfte.val = ptr; - goto shadow_r2t; - } - *pgt = ptr + vaddr.rfx * 8; - rc = 
gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + case ASCE_TYPE_REGION1: + w->last_addr = ptr + vaddr.rfx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rfte.i) + if (table.pgd.i) return PGM_REGION_FIRST_TRANS; - if (rfte.tt != TABLE_TYPE_REGION1) + if (table.pgd.tt != TABLE_TYPE_REGION1) return PGM_TRANSLATION_SPEC; - if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl) return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rfte.p; - ptr = rfte.rto * PAGE_SIZE; -shadow_r2t: - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r1_entry++; - } + w->p |= table.pgd.p; + ptr = table.pgd.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION2: { - union region2_table_entry rste; - - if (*fake) { - ptr += vaddr.rsx * _REGION2_SIZE; - rste.val = ptr; - goto shadow_r3t; - } - *pgt = ptr + vaddr.rsx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + case ASCE_TYPE_REGION2: + w->last_addr = ptr + vaddr.rsx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rste.i) + if (table.p4d.i) return PGM_REGION_SECOND_TRANS; - if (rste.tt != TABLE_TYPE_REGION2) + if (table.p4d.tt != TABLE_TYPE_REGION2) return PGM_TRANSLATION_SPEC; - if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl) return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rste.p; - ptr = rste.rto * PAGE_SIZE; -shadow_r3t: - rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r2_entry++; - } + w->p |= table.p4d.p; + ptr = table.p4d.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION3: { - union region3_table_entry rtte; - - if (*fake) { - ptr += vaddr.rtx * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; - } - *pgt = ptr + vaddr.rtx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + case ASCE_TYPE_REGION3: + w->last_addr = ptr + vaddr.rtx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rtte.i) + if (table.pud.i) return PGM_REGION_THIRD_TRANS; - if (rtte.tt != TABLE_TYPE_REGION3) + if (table.pud.tt != TABLE_TYPE_REGION3) return PGM_TRANSLATION_SPEC; - if (rtte.cr && asce.p && sg->edat_level >= 2) + if (table.pud.cr && asce.p && sg->edat_level >= 2) return PGM_TRANSLATION_SPEC; - if (rtte.fc && sg->edat_level >= 2) { - *dat_protection |= rtte.fc0.p; - *fake = 1; - ptr = rtte.fc1.rfaa * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; + if (sg->edat_level >= 1) + w->p |= table.pud.p; + if (table.pud.fc && sg->edat_level >= 2) { + table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK); + goto edat_applies; } - if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl) return PGM_SEGMENT_TRANSLATION; - if (sg->edat_level >= 1) - *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * PAGE_SIZE; -shadow_sgt: - rtte.fc0.p |= *dat_protection; - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r3_entry++; - } + ptr = table.pud.fc0.sto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_SEGMENT: { - union segment_table_entry 
ste; - - if (*fake) { - ptr += vaddr.sx * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; - } - *pgt = ptr + vaddr.sx * 8; - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + case ASCE_TYPE_SEGMENT: + w->last_addr = ptr + vaddr.sx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (ste.i) + if (table.pmd.i) return PGM_SEGMENT_TRANSLATION; - if (ste.tt != TABLE_TYPE_SEGMENT) + if (table.pmd.tt != TABLE_TYPE_SEGMENT) return PGM_TRANSLATION_SPEC; - if (ste.cs && asce.p) + if (table.pmd.cs && asce.p) return PGM_TRANSLATION_SPEC; - *dat_protection |= ste.fc0.p; - if (ste.fc && sg->edat_level >= 1) { - *fake = 1; - ptr = ste.fc1.sfaa * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; + w->p |= table.pmd.p; + if (table.pmd.fc && sg->edat_level >= 1) { + table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK); + goto edat_applies; } - ptr = ste.fc0.pto * (PAGE_SIZE / 2); -shadow_pgt: - ste.fc0.p |= *dat_protection; - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2); + w->level--; + } + w->last_addr = ptr + vaddr.px * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); + if (rc) + return rc; + if (table.pte.i) + return PGM_PAGE_TRANSLATION; + if (table.pte.z) + return PGM_TRANSLATION_SPEC; + w->p |= table.pte.p; +edat_applies: + if (wr && w->p) + return PGM_PROTECTION; + + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr); +} + +static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep, + struct guest_fault *f, bool p) +{ + union pgste pgste; + union pte newpte; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE); + if (rc) + return rc; + + if (!pgste_get_trylock(ptep_h, &pgste)) + return -EAGAIN; + newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s); + newpte.s.d |= ptep_h->s.d; + newpte.s.sd |= ptep_h->s.sd; + newpte.h.p &= ptep_h->h.p; + if (!newpte.h.p && !f->writable) { + rc = -EOPNOTSUPP; + } else { + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); + pgste.vsie_notif = 1; + } + pgste_set_unlock(ptep_h, pgste); + if (rc) + return rc; + if (sg->invalidated) + return -EAGAIN; + + newpte = _pte(f->pfn, 0, !p, 0); + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg)); + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, + struct guest_fault *f, bool p) +{ + union crste newcrste, oldcrste; + gfn_t gfn; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + gfn = f->gfn & (is_pmd(*table) ? 
_SEGMENT_FR_MASK : _REGION3_FR_MASK); + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + if (rc) + return rc; + + do { + /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */ + if (sg->invalidated) + return -EAGAIN; + oldcrste = READ_ONCE(*host); + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p); + newcrste.s.fc1.d |= oldcrste.s.fc1.d; + newcrste.s.fc1.sd |= oldcrste.s.fc1.sd; + newcrste.h.p &= oldcrste.h.p; + newcrste.s.fc1.vsie_notif = 1; + newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif; + newcrste.s.fc1.s = oldcrste.s.fc1.s; + if (!newcrste.h.p && !f->writable) + return -EOPNOTSUPP; + } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false)); + if (sg->invalidated) + return -EAGAIN; + + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); + gfn = gpa_to_gfn(raddr); + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + ; + return 0; +} + +static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + unsigned long saddr, struct pgtwalk *w) +{ + struct guest_fault *entries; + int flags, i, hl, gl, l, rc; + union crste *table, *host; + union pte *ptep, *ptep_h; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + entries = get_entries(w); + ptep_h = NULL; + ptep = NULL; + + rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, + &table, &ptep); + if (rc) + return rc; + + /* A race occurred. The shadow mapping is already valid, nothing to do */ + if ((ptep && !ptep->h.i && ptep->h.p == w->p) || + (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p)) + return 0; + + gl = get_level(table, ptep); + + /* In case of a real address space */ + if (w->level <= LEVEL_MEM) { + l = TABLE_TYPE_PAGE_TABLE; + hl = TABLE_TYPE_REGION1; + goto real_address_space; + } + + /* + * Skip levels that are already protected. For each level, protect + * only the page containing the entry, not the whole table. + */ + for (i = gl ; i >= w->level; i--) { + rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr), + entries[i].pfn, i + 1, entries[i].writable); if (rc) return rc; - kvm->stat.gmap_shadow_sg_entry++; + if (sg->invalidated) + return -EAGAIN; } + + rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, + TABLE_TYPE_PAGE_TABLE, &host, &ptep_h); + if (rc) + return rc; + + hl = get_level(host, ptep_h); + /* Get the smallest granularity */ + l = min3(gl, hl, w->level); + +real_address_space: + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? 
DAT_WALK_USES_SKEYS : 0); + /* If necessary, create the shadow mapping */ + if (l < gl) { + rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep); + if (rc) + return rc; } - /* Return the parent address of the page table */ - *pgt = ptr; - return 0; + if (l < hl) { + rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce, + flags, l, &host, &ptep_h); + if (rc) + return rc; + } + + if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm)) + return -EFAULT; + if (l == TABLE_TYPE_PAGE_TABLE) + return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p); + return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p); } -/** - * shadow_pgt_lookup() - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. - */ -static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, - int *dat_protection, int *fake) +static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + unsigned long seq, struct pgtwalk *walk) { - unsigned long pt_index; - unsigned long *table; - struct page *page; + struct gmap *parent; int rc; - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); - *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; + if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; +again: + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + sg->invalidated = false; + rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk); + } + if (rc == -ENOMEM) + goto again; + if (!rc) + kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false); } - spin_unlock(&sg->guest_table_lock); return rc; } /** - * kvm_s390_shadow_fault - handle fault on a shadow page table - * @vcpu: virtual cpu - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @datptr: will contain the address of the faulting DAT table entry, or of - * the valid leaf, plus some flags + * __gaccess_shadow_fault() - Handle fault on a shadow page table. + * @vcpu: Virtual cpu that triggered the action. + * @sg: The shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @datptr: Will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags. + * @wr: Whether this is a write access. 
* - * Returns: - 0 if the shadow fault was successfully resolved - * - > 0 (pgm exception code) on exceptions while faulting - * - -EAGAIN if the caller can retry immediately - * - -EFAULT when accessing invalid guest addresses - * - -ENOMEM if out of memory + * Return: + * * %0 if the shadow fault was successfully resolved + * * > 0 (pgm exception code) on exceptions while faulting + * * %-EAGAIN if the caller can retry immediately + * * %-EFAULT when accessing invalid guest addresses + * * %-ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr, unsigned long *datptr) +static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) { - union vaddress vaddr; - union page_table_entry pte; - unsigned long pgt = 0; - int dat_protection, fake; + struct pgtwalk walk = { .p = false, }; + unsigned long seq; int rc; - if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) - return -EFAULT; - - mmap_read_lock(sg->mm); - /* - * We don't want any guest-2 tables to change - so the parent - * tables/pointers we read stay valid - unshadowing is however - * always possible - only guest_table_lock protects us. - */ - ipte_lock(vcpu->kvm); + seq = vcpu->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); - rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = walk_guest_tables(sg, saddr, &walk, wr); + if (datptr) { + datptr->val = walk.last_addr; + datptr->dat_prot = wr && walk.p; + datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE; + datptr->real = sg->guest_asce.r; + } + if (!rc) + rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk); if (rc) - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, - &fake); + kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true); + return rc; +} - vaddr.addr = saddr; - if (fake) { - pte.val = pgt + vaddr.px * PAGE_SIZE; - goto shadow_page; - } +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) +{ + int rc; - switch (rc) { - case PGM_SEGMENT_TRANSLATION: - case PGM_REGION_THIRD_TRANS: - case PGM_REGION_SECOND_TRANS: - case PGM_REGION_FIRST_TRANS: - pgt |= PEI_NOT_PTE; - break; - case 0: - pgt += vaddr.px * 8; - rc = gmap_read_table(sg->parent, pgt, &pte.val); - } - if (datptr) - *datptr = pgt | dat_protection * PEI_DAT_PROT; - if (!rc && pte.i) - rc = PGM_PAGE_TRANSLATION; - if (!rc && pte.z) - rc = PGM_TRANSLATION_SPEC; -shadow_page: - pte.p |= dat_protection; - if (!rc) - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - vcpu->kvm->stat.gmap_shadow_pg_entry++; + if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm)) + return -EFAULT; + + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + + ipte_lock(vcpu->kvm); + rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r); ipte_unlock(vcpu->kvm); - mmap_read_unlock(sg->mm); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 3fde45a151f2..b5385cec60f4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, - __uint128_t new, u8 access_key, bool *success); +int 
cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 access_key, bool *success); /** * write_guest_with_key - copy data from kernel space to guest space @@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm); int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -/* MVPG PEI indication bits */ -#define PEI_DAT_PROT 2 -#define PEI_NOT_PTE 4 +union mvpg_pei { + unsigned long val; + struct { + unsigned long addr : 61; + unsigned long not_pte : 1; + unsigned long dat_prot: 1; + unsigned long real : 1; + }; +}; -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr, unsigned long *datptr); +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c deleted file mode 100644 index a6d1dbb04c97..000000000000 --- a/arch/s390/kvm/gmap-vsie.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest memory management for KVM/s390 nested VMs. - * - * Copyright IBM Corp. 2008, 2020, 2024 - * - * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> - * Martin Schwidefsky <schwidefsky@de.ibm.com> - * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> - */ - -#include <linux/compiler.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/pgtable.h> -#include <linux/pagemap.h> -#include <linux/mman.h> - -#include <asm/lowcore.h> -#include <asm/gmap.h> -#include <asm/uv.h> - -#include "kvm-s390.h" -#include "gmap.h" - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - * - * Context: Called with parent->shadow_lock held - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg; - - lockdep_assert_held(&parent->shadow_lock); - list_for_each_entry(sg, &parent->children, list) { - if (!gmap_shadow_valid(sg, asce, edat_level)) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. 
- */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || - KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) - return ERR_PTR(-EFAULT); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1), - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 6d8944d1b4a0..3c26e35af0ef 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -7,7 +7,7 @@ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> + * Janosch Frank <frankja@linux.ibm.com> */ #include <linux/compiler.h> @@ -15,107 +15,1321 @@ #include <linux/kvm_host.h> #include <linux/pgtable.h> #include <linux/pagemap.h> - #include <asm/lowcore.h> -#include <asm/gmap.h> #include <asm/uv.h> +#include <asm/gmap_helpers.h> +#include "dat.h" #include "gmap.h" +#include "kvm-s390.h" +#include "faultin.h" + +static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; +} + +static int gmap_limit_to_type(gfn_t limit) +{ + if (!limit) + return TABLE_TYPE_REGION1; + if (limit <= _REGION3_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_SEGMENT; + if (limit <= _REGION2_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION3; + if (limit <= _REGION1_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION2; + return TABLE_TYPE_REGION1; +} + +/** + * gmap_new() - Allocate and initialize a guest address space. + * @kvm: The kvm owning the guest. + * @limit: Maximum address of the gmap address space. + * + * Return: A guest address space structure. 
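+ * Returns %NULL if out of memory.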
+ */
+struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
+{
+	struct crst_table *table;
+	struct gmap *gmap;
+	int type;
+
+	type = gmap_limit_to_type(limit);
+
+	gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
+	if (!gmap)
+		return NULL;
+	INIT_LIST_HEAD(&gmap->children);
+	INIT_LIST_HEAD(&gmap->list);
+	INIT_LIST_HEAD(&gmap->scb_users);
+	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
+	spin_lock_init(&gmap->children_lock);
+	spin_lock_init(&gmap->host_to_rmap_lock);
+	refcount_set(&gmap->refcount, 1);
+
+	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
+	if (!table) {
+		kfree(gmap);
+		return NULL;
+	}
+
+	gmap->asce.val = __pa(table);
+	gmap->asce.dt = type;
+	gmap->asce.tl = _ASCE_TABLE_LENGTH;
+	gmap->asce.x = 1;
+	gmap->asce.p = 1;
+	gmap->asce.s = 1;
+	gmap->kvm = kvm;
+	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
+
+	return gmap;
+}
+
+static void gmap_add_child(struct gmap *parent, struct gmap *child)
+{
+	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
+	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
+	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
+	lockdep_assert_held(&parent->children_lock);
+
+	child->parent = parent;
+
+	if (is_ucontrol(parent))
+		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+	else
+		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+
+	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
+		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+	else
+		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+
+	if (kvm_is_ucontrol(parent->kvm))
+		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
+	list_add(&child->list, &parent->children);
+}
+
+struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
+{
+	struct gmap *res;
+
+	lockdep_assert_not_held(&parent->children_lock);
+	res = gmap_new(parent->kvm, limit);
+	if (res) {
+		scoped_guard(spinlock, &parent->children_lock)
+			gmap_add_child(parent, res);
+	}
+	return res;
+}
+
+int gmap_set_limit(struct gmap *gmap, gfn_t limit)
+{
+	struct kvm_s390_mmu_cache *mc;
+	int rc, type;
+
+	type = gmap_limit_to_type(limit);
+
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc)
+		return -ENOMEM;
+
+	do {
+		rc = kvm_s390_mmu_cache_topup(mc);
+		if (rc)
+			break;
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			rc = dat_set_asce_limit(mc, &gmap->asce, type);
+	} while (rc == -ENOMEM);
+
+	kvm_s390_free_mmu_cache(mc);
+	return rc;
+}
+
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+	struct vsie_rmap *rmap, *rnext, *head;
+	struct radix_tree_iter iter;
+	unsigned long indices[16];
+	unsigned long index;
+	void __rcu **slot;
+	int i, nr;
+
+	/* A radix tree is freed by deleting all of its entries */
+	index = 0;
+	do {
+		nr = 0;
+		radix_tree_for_each_slot(slot, root, &iter, index) {
+			indices[nr] = iter.index;
+			if (++nr == 16)
+				break;
+		}
+		for (i = 0; i < nr; i++) {
+			index = indices[i];
+			head = radix_tree_delete(root, index);
+			gmap_for_each_rmap_safe(rmap, rnext, head)
+				kfree(rmap);
+		}
+	} while (nr > 0);
+}
+
+void gmap_remove_child(struct gmap *child)
+{
+	if (KVM_BUG_ON(!child->parent, child->kvm))
+		return;
+	lockdep_assert_held(&child->parent->children_lock);
+
+	list_del(&child->list);
+	child->parent = NULL;
+	child->invalidated = true;
+}
+
+/**
+ * gmap_dispose() - Remove and free a guest address space and its children.
+ * @gmap: Pointer to the guest address space structure.
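+ *
+ * Context: All references must have been dropped beforehand: the gmap must
+ * have been removed from its parent, must have no children and no vsie
+ * shadow block users left, and its reference count must be zero.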
+ */
+void gmap_dispose(struct gmap *gmap)
+{
+	/* The gmap must have been removed from the parent beforehand */
+	KVM_BUG_ON(gmap->parent, gmap->kvm);
+	/* All children of this gmap must have been removed beforehand */
+	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
+	/* No VSIE shadow block is allowed to use this gmap */
+	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
+	/* The ASCE must be valid */
+	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
+	/* The refcount must be 0 */
+	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
+
+	/* Flush tlb of all gmaps */
+	asce_flush_tlb(gmap->asce);
+
+	/* Free all DAT tables. */
+	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
+
+	/* Free additional data for a shadow gmap */
+	if (is_shadow(gmap))
+		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+
+	kfree(gmap);
+}
+
+/**
+ * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
+ * @gmap: The gmap whose ASCE needs to be replaced.
+ *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL;
+ * otherwise the pointers in the host_to_guest radix tree would keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ *
+ * Return: 0 in case of success, -EINVAL if the ASCE is a segment type ASCE,
+ * -ENOMEM if running out of memory.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+	struct crst_table *table;
+	union asce asce;
+
+	/* Replacing segment type ASCEs would cause serious issues */
+	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
+		return -EINVAL;
+
+	table = dat_alloc_crst_sleepable(0);
+	if (!table)
+		return -ENOMEM;
+	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));
+
+	/* Set new table origin while preserving existing ASCE control bits */
+	asce = gmap->asce;
+	asce.rsto = virt_to_pfn(table);
+	WRITE_ONCE(gmap->asce, asce);
+
+	return 0;
+}
+
+bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
+{
+	struct kvm *kvm = gmap->kvm;
+	struct kvm_vcpu *vcpu;
+	gfn_t prefix_gfn;
+	unsigned long i;
+
+	if (is_shadow(gmap))
+		return false;
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/* Match against both prefix pages */
+		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
+		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
+			if (hint && kvm_s390_is_in_sie(vcpu))
+				return false;
+			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
+				   gfn_to_gpa(gfn), gfn_to_gpa(end));
+			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
+		}
+	}
+	return true;
+}
+
+struct clear_young_pte_priv {
+	struct gmap *gmap;
+	bool young;
+};
+
+static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
+{
+	struct clear_young_pte_priv *p = walk->priv;
+	union pgste pgste;
+	union pte pte, new;
+
+	pte = READ_ONCE(*ptep);
+
+	if (!pte.s.pr || (!pte.s.y && pte.h.i))
+		return 0;
+
+	pgste = pgste_get_lock(ptep);
+	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
+		new = pte;
+		new.h.i = 1;
+		new.s.y = 0;
+		if ((new.s.d || !new.h.p) && !new.s.s)
+			folio_set_dirty(pfn_folio(pte.h.pfra));
+		new.s.d = 0;
+		new.h.p = 1;
+
+		pgste.prefix_notif = 0;
+		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
+	}
+	p->young = 1;
+
pgste_set_unlock(ptep, pgste); + return 0; +} + +static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *priv = walk->priv; + union crste crste, new; + + do { + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) + break; + + new = crste; + new.h.i = 1; + new.s.fc1.y = 0; + new.s.fc1.prefix_notif = 0; + if (new.s.fc1.d || !new.h.p) + folio_set_dirty(phys_to_folio(crste_origin_large(crste))); + new.s.fc1.d = 0; + new.h.p = 1; + } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); + + priv->young = 1; + return 0; +} /** - * gmap_make_secure() - make one guest page secure - * @gmap: the guest gmap - * @gaddr: the guest address that needs to be made secure - * @uvcb: the UVCB specifying which operation needs to be performed + * gmap_age_gfn() - Clear young. + * @gmap: The guest gmap. + * @start: The first gfn to test. + * @end: The gfn after the last one to test. * - * Context: needs to be called with kvm->srcu held. - * Return: 0 on success, < 0 in case of error. + * Context: Called with the kvm mmu write lock held. + * Return: 1 if any page in the given range was young, otherwise 0. */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) { - struct kvm *kvm = gmap->private; + const struct dat_walk_ops ops = { + .pte_entry = gmap_clear_young_pte, + .pmd_entry = gmap_clear_young_crste, + .pud_entry = gmap_clear_young_crste, + }; + struct clear_young_pte_priv priv = { + .gmap = gmap, + .young = false, + }; + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + + return priv.young; +} + +struct gmap_unmap_priv { + struct gmap *gmap; + struct kvm_memory_slot *slot; +}; + +static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) +{ + struct gmap_unmap_priv *priv = w->priv; + struct folio *folio = NULL; unsigned long vmaddr; + union pgste pgste; - lockdep_assert_held(&kvm->srcu); + pgste = pgste_get_lock(ptep); + if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { + vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); + gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); + } + if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = pfn_folio(ptep->h.pfra); + pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); + pgste_set_unlock(ptep, pgste); + if (folio) + uv_convert_from_secure_folio(folio); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return -EFAULT; - return make_hva_secure(gmap->mm, vmaddr, uvcb); + return 0; } -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, + struct gmap_unmap_priv *priv = walk->priv; + struct folio *folio = NULL; + union crste old = *crstep; + + if (!old.h.fc) + return 0; + + if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = phys_to_folio(crste_origin_large(old)); + /* No races should happen because kvm->mmu_lock is held in write mode */ + KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), + 
priv->gmap->kvm); + if (folio) + uv_convert_from_secure_folio(folio); + + return 0; +} + +/** + * gmap_unmap_gfn_range() - Unmap a range of guest addresses. + * @gmap: The gmap to act on. + * @slot: The memslot in which the range is located. + * @start: The first gfn to unmap. + * @end: The gfn after the last one to unmap. + * + * Context: Called with the kvm mmu write lock held. + * Return: false + */ +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = _gmap_unmap_pte, + .pmd_entry = _gmap_unmap_crste, + .pud_entry = _gmap_unmap_crste, }; + struct gmap_unmap_priv priv = { + .gmap = gmap, + .slot = slot, + }; + + lockdep_assert_held_write(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + return false; +} + +static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, + struct gmap *gmap) +{ + union pte pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (pte.h.p && !pte.s.sd)) + return pgste; + + /* + * If this page contains one or more prefixes of vCPUS that are currently + * running, do not reset the protection, leave it marked as dirty. + */ + if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { + pte.h.p = 1; + pte.s.sd = 0; + pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); + } + + mark_page_dirty(gmap->kvm, gfn); + + return pgste; +} + +static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, new; + + if (fatal_signal_pending(current)) + return 1; + do { + crste = READ_ONCE(*table); + if (!crste.h.fc) + return 0; + if (crste.h.p && !crste.s.fc1.sd) + return 0; + + /* + * If this large page contains one or more prefixes of vCPUs that are + * currently running, do not reset the protection, leave it marked as + * dirty. + */ + if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) + break; + new = crste; + new.h.p = 1; + new.s.fc1.sd = 0; + } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); + + for ( ; gfn < end; gfn++) + mark_page_dirty(gmap->kvm, gfn); - return gmap_make_secure(gmap, gaddr, &uvcb); + return 0; +} + +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops walk_ops = { + .pte_entry = _pte_test_and_clear_softdirty, + .pmd_entry = _crste_test_and_clear_softdirty, + .pud_entry = _crste_test_and_clear_softdirty, + }; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); +} + +static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f) +{ + union crste newcrste, oldcrste = READ_ONCE(*f->crstep); + + /* Somehow the crste is not large anymore, let the slow path deal with it. */ + if (!oldcrste.h.fc) + return 1; + + f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); + f->writable = oldcrste.s.fc1.w; + + /* Appropriate permissions already (race with another handler), nothing to do. 
*/ + if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) + return 0; + + if (!f->write_attempt || oldcrste.s.fc1.w) { + f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; + newcrste = oldcrste; + newcrste.h.i = 0; + newcrste.s.fc1.y = 1; + if (f->write_attempt) { + newcrste.h.p = 0; + newcrste.s.fc1.d = 1; + newcrste.s.fc1.sd = 1; + } + /* In case of races, let the slow path deal with it. */ + return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn); + } + /* Trying to write on a read-only page, let the slow path deal with it. */ + return 1; +} + +static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, + struct guest_fault *f) +{ + union pte newpte, oldpte = READ_ONCE(*f->ptep); + + f->pfn = oldpte.h.pfra; + f->writable = oldpte.s.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) + return 0; + /* Trying to write on a read-only page, let the slow path deal with it. */ + if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) + return 1; + + newpte = oldpte; + newpte.h.i = 0; + newpte.s.y = 1; + if (f->write_attempt) { + newpte.h.p = 0; + newpte.s.d = 1; + newpte.s.sd = 1; + } + *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); + + return 0; } /** - * __gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @page: the page to destroy + * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. + * @gmap: The gmap whose fault needs to be resolved. + * @fault: Describes the fault that is being resolved. * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. + * A minor fault is a fault that can be resolved quickly within gmap. + * The page is already mapped, the fault is only due to dirty/young tracking. * - * Context: must be called holding the mm lock for gmap->mm + * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could + * not be resolved and needs to go through the slow path. */ -static int __gmap_destroy_page(struct gmap *gmap, struct page *page) +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) { - struct folio *folio = page_folio(page); + union pgste pgste; int rc; - /* - * See gmap_make_secure(): large folios cannot be secure. Small - * folio implies FW_LEVEL_PTE. - */ - if (folio_test_large(folio)) - return -EFAULT; + lockdep_assert_held(&gmap->kvm->mmu_lock); - rc = uv_destroy_folio(folio); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ + rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, + &fault->crstep, &fault->ptep); + /* If a PTE or a leaf CRSTE could not be reached, slow path. 
*/ if (rc) - rc = uv_convert_from_secure_folio(folio); + return 1; + if (fault->ptep) { + pgste = pgste_get_lock(fault->ptep); + rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); + if (!rc && fault->callback) + fault->callback(fault); + pgste_set_unlock(fault->ptep, pgste); + } else { + rc = gmap_handle_minor_crste_fault(gmap, fault); + if (!rc && fault->callback) + fault->callback(fault); + } return rc; } +static inline bool gmap_2g_allowed(struct gmap *gmap, struct guest_fault *f, + struct kvm_memory_slot *slot) +{ + return false; +} + /** - * gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy + * gmap_1m_allowed() - Check whether a 1M hugepage is allowed. + * @gmap: The gmap of the guest. + * @f: Describes the fault that is being resolved. + * @slot: The memslot the faulting address belongs to. * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. + * The function checks whether the GMAP_FLAG_ALLOW_HPAGE_1M flag is set for + * @gmap, whether the offset of the address in the 1M virtual frame is the + * same as the offset in the physical 1M frame, and finally whether the whole + * 1M page would fit in the given memslot. * - * Context: may sleep. + * Return: true if a 1M hugepage is allowed to back the faulting address, false + * otherwise. */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +static inline bool gmap_1m_allowed(struct gmap *gmap, struct guest_fault *f, + struct kvm_memory_slot *slot) +{ + return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags) && + !((f->gfn ^ f->pfn) & ~_SEGMENT_FR_MASK) && + slot->base_gfn <= ALIGN_DOWN(f->gfn, _PAGES_PER_SEGMENT) && + slot->base_gfn + slot->npages >= ALIGN(f->gfn + 1, _PAGES_PER_SEGMENT); +} + +static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level, + struct guest_fault *f) { - struct page *page; + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; int rc = 0; - mmap_read_lock(gmap->mm); - page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); - if (page) - rc = __gmap_destroy_page(gmap, page); - kvm_release_page_clean(page); - mmap_read_unlock(gmap->mm); + rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level, + &f->crstep, &f->ptep); + if (rc == -ENOMEM) + return rc; + if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm)) + return rc; + if (rc) + return -EAGAIN; + if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm)) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + do { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.s = !f->page; + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn)); + if (f->callback) + f->callback(f); + } + return rc; } + +int 
gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f,
+	      struct kvm_memory_slot *slot)
+{
+	unsigned int order;
+	int level;
+
+	lockdep_assert_held(&gmap->kvm->mmu_lock);
+
+	level = TABLE_TYPE_PAGE_TABLE;
+	if (f->page) {
+		order = folio_order(page_folio(f->page));
+		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f, slot))
+			level = TABLE_TYPE_REGION3;
+		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f, slot))
+			level = TABLE_TYPE_SEGMENT;
+	}
+	return _gmap_link(mc, gmap, level, f);
+}
+
+static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
+			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
+{
+	union crste newcrste, oldcrste;
+	struct page_table *pt;
+	union crste *crstep;
+	union pte *ptep;
+	int rc;
+
+	if (force_alloc)
+		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
+				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	else
+		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
+				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
+	if (rc)
+		return rc;
+	if (!ptep) {
+		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
+		newcrste.h.i = 1;
+		newcrste.h.fc0.tl = 1;
+	} else {
+		pt = pte_table_start(ptep);
+		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
+		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
+	}
+	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
+			    &crstep, &ptep);
+	if (rc)
+		return rc;
+	do {
+		oldcrste = READ_ONCE(*crstep);
+		if (oldcrste.val == newcrste.val)
+			break;
+	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
+	return 0;
+}
+
+static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
+{
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
+			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
+	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
+		return -EREMOTE;
+	if (!ptep)
+		return 1;
+	*gaddr &= ~_SEGMENT_MASK;
+	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
+	return 0;
+}
+
+/**
+ * gmap_ucas_translate() - Translate a vcpu address into a host gmap address.
+ * @mc: The memory cache to be used for allocations.
+ * @gmap: The per-cpu gmap.
+ * @gaddr: Pointer to the address to be translated, will get overwritten with
+ *	   the translated address in case of success.
+ *
+ * Translates the per-vCPU guest address into a fake guest address, which can
+ * then be used with the fake memslots that are identity mapping userspace.
+ * This allows ucontrol VMs to use the normal fault resolution path, like
+ * normal VMs.
+ *
+ * Return: %0 in case of success, %-ENOMEM if running out of memory,
+ * otherwise %-EREMOTE.
+ */ +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) +{ + gpa_t translated_address; + union crste *crstep; + gfn_t gfn; + int rc; + + gfn = gpa_to_gfn(*gaddr); + + scoped_guard(read_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + } + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + translated_address = (*gaddr & ~_SEGMENT_MASK) | + (crstep->val & _SEGMENT_MASK); + rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); + } + if (!rc) { + *gaddr = translated_address; + return 0; + } + if (rc != -ENOMEM) + return -EREMOTE; + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } while (1); + return 0; +} + +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) +{ + struct kvm_s390_mmu_cache *mc; + int rc; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + while (count) { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + continue; + } + if (rc) + return rc; + + count--; + c_gfn += _PAGE_ENTRIES; + p_gfn += _PAGE_ENTRIES; + } + return rc; +} + +static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) +{ + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (rc) + return; + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) + ; +} + +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) +{ + guard(read_lock)(&gmap->kvm->mmu_lock); + + for ( ; count; count--, c_gfn += _PAGE_ENTRIES) + gmap_ucas_unmap_one(gmap, c_gfn); +} + +static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, newcrste; + + crste = READ_ONCE(*crstep); + newcrste = _CRSTE_EMPTY(crste.h.tt); + + while (crste_leaf(crste)) { + if (crste_prefix(crste)) + gmap_unmap_prefix(gmap, gfn, next); + if (crste.s.fc1.vsie_notif) + gmap_handle_vsie_unshadow_event(gmap, gfn); + if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) + break; + crste = READ_ONCE(*crstep); + } + + if (need_resched()) + return next; + + return 0; +} + +void gmap_split_huge_pages(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { + .pmd_entry = _gmap_split_crste, + .pud_entry = _gmap_split_crste, + }; + gfn_t start = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, + &ops, DAT_WALK_IGN_HOLES, gmap); + cond_resched(); + } while (start); +} + +static int _gmap_enable_skeys(struct gmap *gmap) +{ + gfn_t start = 0; + int rc; + + if (uses_skeys(gmap)) + return 0; + + set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + return rc; + } + + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + start = dat_reset_skeys(gmap->asce, start); + cond_resched(); + } while (start); + return 0; +} + +int gmap_enable_skeys(struct gmap *gmap) +{ + int rc; + + mmap_write_lock(gmap->kvm->mm); + rc = _gmap_enable_skeys(gmap); + mmap_write_unlock(gmap->kvm->mm); + return rc; +} + +static long _destroy_pages_pte(union pte 
*ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + if (!ptep->s.pr) + return 0; + __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); + if (need_resched()) + return next; + return 0; +} + +static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t origin, cur, end; + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + origin = crste_origin_large(*crstep); + cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + for ( ; cur < end; cur += PAGE_SIZE) + __kvm_s390_pv_destroy_page(phys_to_page(cur)); + if (need_resched()) + return next; + return 0; +} + +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) +{ + const struct dat_walk_ops ops = { + .pte_entry = _destroy_pages_pte, + .pmd_entry = _destroy_pages_crste, + .pud_entry = _destroy_pages_crste, + }; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } while (start && start < end); + return 0; +} + +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) +{ + struct vsie_rmap *rmap __free(kvfree) = NULL; + struct vsie_rmap *temp; + void __rcu **slot; + int rc = 0; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->host_to_rmap_lock); + + rmap = kzalloc_obj(*rmap, GFP_ATOMIC); + if (!rmap) + return -ENOMEM; + + rmap->r_gfn = r_gfn; + rmap->level = level; + slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->val == rmap->val) + return 0; + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); + if (rc) + return rc; + } + rmap = NULL; + + return 0; +} + +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + union pte pte; + int flags, rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->parent->children_lock); + + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); + rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + if (level <= TABLE_TYPE_REGION1) { + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); + } + if (rc) + return rc; + + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pte = ptep->s.pr ? 
*ptep : _pte(pfn, wr, false, false);
+	pte.h.p = 1;
+	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
+	pgste.vsie_notif = 1;
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+void gmap_set_cmma_all_dirty(struct gmap *gmap)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
+	gfn_t gfn = 0;
+
+	do {
+		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
+						  DAT_WALK_IGN_HOLES, NULL);
+		cond_resched();
+	} while (gfn);
+}
+
+static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
+{
+	unsigned long align = PAGE_SIZE;
+	gpa_t gaddr = gfn_to_gpa(r_gfn);
+	union crste *crstep;
+	union crste crste;
+	union pte *ptep;
+
+	if (level > TABLE_TYPE_PAGE_TABLE)
+		align = 1UL << (11 * level + _SEGMENT_SHIFT);
+	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
+	sg->invalidated = true;
+	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
+		return;
+	if (ptep) {
+		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
+			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
+		return;
+	}
+
+	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
+	if (crste_leaf(crste) || crste.h.i)
+		return;
+	if (is_pmd(crste))
+		dat_free_pt(dereference_pmd(crste.pmd));
+	else
+		dat_free_level(dereference_crste(crste), true);
+}
+
+static void gmap_unshadow(struct gmap *sg)
+{
+	struct gmap_cache *gmap_cache, *next;
+
+	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+	KVM_BUG_ON(!sg->parent, sg->kvm);
+
+	lockdep_assert_held(&sg->parent->children_lock);
+
+	gmap_remove_child(sg);
+	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
+
+	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
+		gmap_cache->gmap = NULL;
+		list_del(&gmap_cache->list);
+	}
+
+	gmap_put(sg);
+}
+
+void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
+{
+	struct vsie_rmap *rmap, *rnext, *head;
+	struct gmap *sg, *next;
+	gfn_t start, end;
+
+	list_for_each_entry_safe(sg, next, &parent->children, list) {
+		start = sg->guest_asce.rsto;
+		end = start + sg->guest_asce.tl + 1;
+		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
+			gmap_unshadow(sg);
+			continue;
+		}
+		scoped_guard(spinlock, &sg->host_to_rmap_lock)
+			head = radix_tree_delete(&sg->host_to_rmap, gfn);
+		gmap_for_each_rmap_safe(rmap, rnext, head)
+			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
+	}
+}
+
+/**
+ * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
+ * @parent: Pointer to the parent gmap.
+ * @asce: ASCE for which the shadow table is created.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * Context: Called with parent->children_lock held.
+ *
+ * Return: The pointer to a gmap if a shadow table with the given asce is
+ * already available, otherwise NULL.
+ */ +static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->children_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_is_shadow_valid(sg, asce, edat_level)) + continue; + return sg; + } + return NULL; +} + +#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) +struct gmap_protect_asce_top_level { + unsigned long seq; + struct guest_fault f[CRST_TABLE_PAGES]; +}; + +static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + struct gmap *parent; + int rc, i; + + guard(write_lock)(&sg->kvm->mmu_lock); + + if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + sg->invalidated = false; + for (i = 0; i < CRST_TABLE_PAGES; i++) { + if (!context->f[i].valid) + continue; + rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, + TABLE_TYPE_REGION1 + 1, context->f[i].writable); + if (rc) + return rc; + } + gmap_add_child(sg->parent, sg); + } + + kvm_s390_release_faultin_array(sg->kvm, context->f, false); + return 0; +} + +static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + int rc; + + if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + rc = __gmap_protect_asce_top_level(mc, sg, context); + radix_tree_preload_end(); + } while (rc == -ENOMEM); + + return rc; +} + +static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) +{ + struct gmap_protect_asce_top_level context = {}; + union asce asce = sg->guest_asce; + int rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + + context.seq = sg->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); + if (rc > 0) + rc = -EFAULT; + if (!rc) + rc = _gmap_protect_asce_top_level(mc, sg, &context); + if (rc) + kvm_s390_release_faultin_array(sg->kvm, context.f, true); + return rc; +} + +/** + * gmap_create_shadow() - Create/find a shadow guest address space. + * @mc: The cache to use to allocate dat tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * The returned shadow gmap will be returned with one extra reference. + * + * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. 
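+ * ERR_PTR(-EINVAL) is returned if no parent gmap is provided.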
+ */
+struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
+				union asce asce, int edat_level)
+{
+	struct gmap *sg, *new;
+	int rc;
+
+	if (WARN_ON(!parent))
+		return ERR_PTR(-EINVAL);
+
+	scoped_guard(spinlock, &parent->children_lock) {
+		sg = gmap_find_shadow(parent, asce, edat_level);
+		if (sg) {
+			gmap_get(sg);
+			return sg;
+		}
+	}
+	/* Create a new shadow gmap. */
+	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+	new->guest_asce = asce;
+	new->edat_level = edat_level;
+	set_bit(GMAP_FLAG_SHADOW, &new->flags);
+
+	scoped_guard(spinlock, &parent->children_lock) {
+		/* Recheck if another CPU created the same shadow. */
+		sg = gmap_find_shadow(parent, asce, edat_level);
+		if (sg) {
+			gmap_put(new);
+			gmap_get(sg);
+			return sg;
+		}
+		if (asce.r) {
+			/* Only allow one real-space gmap shadow. */
+			list_for_each_entry(sg, &parent->children, list) {
+				if (sg->guest_asce.r) {
+					scoped_guard(write_lock, &parent->kvm->mmu_lock)
+						gmap_unshadow(sg);
+					break;
+				}
+			}
+			gmap_add_child(parent, new);
+			/* Nothing to protect, return right away. */
+			gmap_get(new);
+			return new;
+		}
+	}
+
+	gmap_get(new);
+	new->parent = parent;
+	/* Protect while inserting, protects against invalidation races. */
+	rc = gmap_protect_asce_top_level(mc, new);
+	if (rc) {
+		new->parent = NULL;
+		gmap_put(new);
+		gmap_put(new);
+		return ERR_PTR(rc);
+	}
+	return new;
+}
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index c8f031c9ea5f..96ee1395a592 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -10,30 +10,246 @@
 #ifndef ARCH_KVM_S390_GMAP_H
 #define ARCH_KVM_S390_GMAP_H
 
-#define GMAP_SHADOW_FAKE_TABLE 1ULL
+#include "dat.h"
 
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
+/**
+ * enum gmap_flags - Flags of a gmap.
+ *
+ * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap.
+ * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0
+ * only for ucontrol per-cpu gmaps, since they
+ * share the page tables with the main gmap.
+ * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap).
+ * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap,
+ * independently of the page size used by userspace.
+ * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap,
+ * independently of the page size used by userspace.
+ * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap.
+ * @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys.
+ * @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA.
+ * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping.
+ */
+enum gmap_flags {
+	GMAP_FLAG_SHADOW = 0,
+	GMAP_FLAG_OWNS_PAGETABLES,
+	GMAP_FLAG_IS_UCONTROL,
+	GMAP_FLAG_ALLOW_HPAGE_1M,
+	GMAP_FLAG_ALLOW_HPAGE_2G,
+	GMAP_FLAG_PFAULT_ENABLED,
+	GMAP_FLAG_USES_SKEYS,
+	GMAP_FLAG_USES_CMM,
+	GMAP_FLAG_EXPORT_ON_UNMAP,
+};
 
 /**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
+ * struct gmap - Guest address space.
* - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. + * @flags: GMAP_FLAG_* flags. + * @edat_level: The edat level of this shadow gmap. + * @kvm: The vm. + * @asce: The ASCE used by this gmap. + * @list: List head used in children gmaps for the children gmap list. + * @children_lock: Protects children and scb_users. + * @children: List of child gmaps of this gmap. + * @scb_users: List of vsie_scb that use this shadow gmap. + * @parent: Parent gmap of a child gmap. + * @guest_asce: Original ASCE of this shadow gmap. + * @host_to_rmap_lock: Protects host_to_rmap. + * @host_to_rmap: Radix tree mapping host addresses to guest addresses. + */ +struct gmap { + unsigned long flags; + unsigned char edat_level; + bool invalidated; + struct kvm *kvm; + union asce asce; + struct list_head list; + spinlock_t children_lock; /* Protects: children, scb_users */ + struct list_head children; + struct list_head scb_users; + struct gmap *parent; + union asce guest_asce; + spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */ + struct radix_tree_root host_to_rmap; + refcount_t refcount; +}; + +struct gmap_cache { + struct list_head list; + struct gmap *gmap; +}; + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + +int s390_replace_asce(struct gmap *gmap); +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint); +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end); +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end); +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault); +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit); +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit); +void gmap_remove_child(struct gmap *child); +void gmap_dispose(struct gmap *gmap); +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault, + struct kvm_memory_slot *slot); +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end); +int gmap_set_limit(struct gmap *gmap, gfn_t limit); +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr); +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count); +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count); +int gmap_enable_skeys(struct gmap *gmap); +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible); +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr); +void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + union asce asce, int edat_level); +void gmap_split_huge_pages(struct gmap *gmap); + +static inline bool uses_skeys(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); +} + +static inline bool uses_cmm(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags); +} + +static inline bool pfault_enabled(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags); +} + +static inline bool is_ucontrol(struct gmap *gmap) +{ + return 
test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags); +} + +static inline bool is_shadow(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_SHADOW, &gmap->flags); +} + +static inline bool owns_page_tables(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); +} + +static inline struct gmap *gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->refcount)) + gmap_dispose(gmap); + return NULL; +} + +static inline void gmap_get(struct gmap *gmap) +{ + WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount))); +} + +static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + scoped_guard(spinlock, &parent->children_lock) + _gmap_handle_vsie_unshadow_event(parent, gfn); +} + +static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, true); +} + +static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, false); +} + +static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn, bool needs_lock) +{ + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + else + lockdep_assert_not_held(&gmap->children_lock); + + if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) { + pgste.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + 1); + } + if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) { + pgste.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + if (!ptep->s.d && newpte.s.d && !newpte.s.s) + SetPageDirty(pfn_to_page(newpte.h.pfra)); + return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); +} + +static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn) +{ + return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); +} + +static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn, bool needs_lock) +{ + unsigned long align = is_pmd(newcrste) ? 
_PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES; + + if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm)) + return true; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + + gfn = ALIGN_DOWN(gfn, align); + if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) { + newcrste.s.fc1.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + align); + } + if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && + (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) { + newcrste.s.fc1.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) + SetPageDirty(phys_to_page(crste_origin_large(newcrste))); + return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); +} + +static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn) +{ + return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true); +} + +/** + * gmap_is_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid. + * @sg: Pointer to the shadow guest address space structure. + * @asce: ASCE for which the shadow table is requested. + * @edat_level: Edat level to be used for the shadow translation. * + * Return: true if the gmap shadow is still valid and matches the given + * properties and the caller can continue using it; false otherwise, the + * caller has to request a new shadow gmap in this case. */ -static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level) { - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; + return sg->guest_asce.val == asce.val && sg->edat_level == edat_level; } -#endif +#endif /* ARCH_KVM_S390_GMAP_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 80879fc73c90..69835e1d4f20 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -232,18 +232,14 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, } if (nr_wp > 0) { - wp_info = kmalloc_array(nr_wp, - sizeof(*wp_info), - GFP_KERNEL_ACCOUNT); + wp_info = kmalloc_objs(*wp_info, nr_wp, GFP_KERNEL_ACCOUNT); if (!wp_info) { ret = -ENOMEM; goto error; } } if (nr_bp > 0) { - bp_info = kmalloc_array(nr_bp, - sizeof(*bp_info), - GFP_KERNEL_ACCOUNT); + bp_info = kmalloc_objs(*bp_info, nr_bp, GFP_KERNEL_ACCOUNT); if (!bp_info) { ret = -ENOMEM; goto error; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 610dd44a948b..39aff324203e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,7 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" -#include "gmap.h" +#include "faultin.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -95,7 +95,7 @@ static int handle_validity(struct kvm_vcpu *vcpu) vcpu->stat.exit_validity++; trace_kvm_s390_intercept_validity(vcpu, viwhy); - KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy, + KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy, current->pid, vcpu->kvm); /* do not warn on invalid runtime instrumentation mode */ @@ -368,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu 
*vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false); + } while (rc == -EAGAIN); + if (rc) return rc; /* Ensure that the source is paged-in, no actual access -> no key checking */ @@ -377,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true); + } while (rc == -EAGAIN); + if (rc) return rc; kvm_s390_retry_instr(vcpu); @@ -472,6 +478,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); + if (vcpu->kvm->arch.user_operexec) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); @@ -545,7 +554,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) guest_uvcb->header.cmd); return 0; } - rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb); /* * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. @@ -653,10 +662,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) break; case ICPT_PV_PREF: rc = 0; - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu)); - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu)); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2811a6c093b8..07f59c3b9a7b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -7,13 +7,13 @@ * Author(s): Carsten Otte <cotte@de.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt #include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/kvm_host.h> #include <linux/hrtimer.h> +#include <linux/export.h> #include <linux/mmu_context.h> #include <linux/nospec.h> #include <linux/signal.h> @@ -26,7 +26,6 @@ #include <linux/uaccess.h> #include <asm/sclp.h> #include <asm/isc.h> -#include <asm/gmap.h> #include <asm/nmi.h> #include <asm/airq.h> #include <asm/tpi.h> @@ -34,6 +33,7 @@ #include "gaccess.h" #include "trace-s390.h" #include "pci.h" +#include "gmap.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -44,70 +44,34 @@ static struct kvm_s390_gib *gib; /* handle external calls via sigp interpretation facility */ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) { - int c, scn; + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl; if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND)) return 0; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl sigp_ctrl = - sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl sigp_ctrl = 
- sca->cpu[vcpu->vcpu_id].sigp_ctrl; - - c = sigp_ctrl.c; - scn = sigp_ctrl.scn; - } - read_unlock(&vcpu->kvm->arch.sca_lock); if (src_id) - *src_id = scn; + *src_id = sigp_ctrl.scn; - return c; + return sigp_ctrl.c; } static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1}; int expect, rc; BUG_ON(!kvm_s390_use_sca_entries()); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val; - - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; - - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val; - old_val = READ_ONCE(*sigp_ctrl); - new_val.scn = src_id; - new_val.c = 1; - old_val.c = 0; + old_val = READ_ONCE(*sigp_ctrl); + old_val.c = 0; - expect = old_val.value; - rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); - } - read_unlock(&vcpu->kvm->arch.sca_lock); + expect = old_val.value; + rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value); if (rc != expect) { /* another external call is pending */ @@ -119,24 +83,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { + struct esca_block *sca = vcpu->kvm->arch.sca; + union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl; + if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - union esca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - WRITE_ONCE(sigp_ctrl->value, 0); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - union bsca_sigp_ctrl *sigp_ctrl = - &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - - WRITE_ONCE(sigp_ctrl->value, 0); - } - read_unlock(&vcpu->kvm->arch.sca_lock); + WRITE_ONCE(sigp_ctrl->value, 0); } int psw_extint_disabled(struct kvm_vcpu *vcpu) @@ -1002,6 +956,9 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); + if (!ext.ext_params) + return 0; + VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", ext.ext_params); vcpu->stat.deliver_service_signal++; @@ -1223,7 +1180,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - if (!sclp.has_sigpif) + if (!kvm_s390_use_sca_entries()) return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); return sca_ext_call_pending(vcpu, NULL); @@ -1322,6 +1279,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: kvm_vcpu_srcu_read_unlock(vcpu); + vcpu->kvm->arch.float_int.last_sleep_cpu = vcpu->vcpu_idx; kvm_vcpu_halt(vcpu); vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); @@ -1547,7 +1505,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif 
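With only the extended SCA left, sca_inject_ext_call() above drops the sca_lock entirely: one cmpxchg() on the sigp control word simultaneously checks that no external call is pending (old_val.c is forced to 0) and installs the new one. The same idiom in isolation, assuming the bitfield layout of union esca_sigp_ctrl shown in the patch:

/*
 * Sketch: lockless "inject if none pending" on a sigp control word.
 * Relies on union esca_sigp_ctrl exposing a raw .value alongside the
 * .c (call pending) and .scn (source cpu number) bitfields.
 */
static int inject_ext_call_once(union esca_sigp_ctrl *ctrl, int src_id)
{
	union esca_sigp_ctrl old_val, new_val = { .scn = src_id, .c = 1 };

	old_val = READ_ONCE(*ctrl);
	old_val.c = 0; /* we may only win if no call is pending */

	if (cmpxchg(&ctrl->value, old_val.value, new_val.value) != old_val.value)
		return -EBUSY; /* raced with another external call */
	return 0;
}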
&& !kvm_s390_pv_cpu_get_handle(vcpu)) + if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1794,7 +1752,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, goto out; } gisa_out: - tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + tmp_inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (tmp_inti) { tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); tmp_inti->io.io_int_word = isc_to_int_word(isc); @@ -1948,18 +1906,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) if (!online_vcpus) return; - /* find idle VCPUs first, then round robin */ - sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus); - if (sigcpu == online_vcpus) { - do { - sigcpu = kvm->arch.float_int.next_rr_cpu++; - kvm->arch.float_int.next_rr_cpu %= online_vcpus; - /* avoid endless loops if all vcpus are stopped */ - if (nr_tries++ >= online_vcpus) - return; - } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu))); + for (sigcpu = kvm->arch.float_int.last_sleep_cpu; ; sigcpu++) { + sigcpu %= online_vcpus; + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + if (!is_vcpu_stopped(dst_vcpu)) + break; + /* avoid endless loops if all vcpus are stopped */ + if (nr_tries++ >= online_vcpus) + return; } - dst_vcpu = kvm_get_vcpu(kvm, sigcpu); /* make the VCPU drop out of the SIE, or wake it up if sleeping */ switch (type) { @@ -2016,7 +1971,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti; int rc; - inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2422,7 +2377,7 @@ static int enqueue_floating_irq(struct kvm_device *dev, return -EINVAL; while (len >= sizeof(struct kvm_s390_irq)) { - inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); + inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2470,7 +2425,7 @@ static int register_io_adapter(struct kvm_device *dev, if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); + adapter = kzalloc_obj(*adapter, GFP_KERNEL_ACCOUNT); if (!adapter) return -ENOMEM; @@ -2680,12 +2635,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_APF_ENABLE: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 1; + set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 0; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); /* * Make sure no async faults are in transition when * clearing the queues. So we don't need to worry @@ -2772,17 +2727,27 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) bit = bit_nr + (addr % PAGE_SIZE) * 8; + /* kvm_set_routing_entry() should never allow this to happen */ + WARN_ON_ONCE(bit > (PAGE_SIZE * BITS_PER_BYTE - 1)); + return swap ? 
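__floating_irq_kick() above replaces the idle-mask scan with a simple rotor: scanning starts at float_int.last_sleep_cpu, the vCPU that most recently went to sleep and is therefore likely idle, and wraps around, with a counter guarding against the all-stopped case. Roughly, as a standalone sketch:

/*
 * Sketch of the kick-target selection: start at the vCPU that last
 * went to sleep, wrap modulo the number of online vCPUs, and give up
 * if every vCPU is stopped.
 */
static struct kvm_vcpu *pick_kick_target(struct kvm *kvm)
{
	unsigned int online = atomic_read(&kvm->online_vcpus);
	struct kvm_vcpu *vcpu;
	unsigned int i, idx;

	if (!online)
		return NULL;

	idx = kvm->arch.float_int.last_sleep_cpu;
	for (i = 0; i < online; i++, idx++) {
		vcpu = kvm_get_vcpu(kvm, idx % online);
		if (!is_vcpu_stopped(vcpu))
			return vcpu;
	}
	return NULL; /* all vCPUs stopped, nothing to kick */
}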
(bit ^ (BITS_PER_LONG - 1)) : bit; } static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { + struct mm_struct *mm = kvm->mm; struct page *page = NULL; + int locked = 1; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, + &page, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - mmap_read_lock(kvm->mm); - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL); - mmap_read_unlock(kvm->mm); return page; } @@ -2809,13 +2774,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2865,6 +2830,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, int rc; mci.val = mcck_info->mcic; + + /* log machine checks being reinjected on all debugs */ + VCPU_EVENT(vcpu, 2, "guest machine check %lx", mci.val); + KVM_EVENT(2, "guest machine check %lx", mci.val); + pr_info("guest machine check pid %d: %lx", current->pid, mci.val); + if (mci.sr) cr14 |= CR14_RECOVERY_SUBMASK; if (mci.dg) @@ -2893,6 +2864,7 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { + const struct kvm_irq_routing_s390_adapter *adapter; u64 uaddr_s, uaddr_i; int idx; @@ -2903,6 +2875,14 @@ int kvm_set_routing_entry(struct kvm *kvm, return -EINVAL; e->set = set_adapter_int; + adapter = &ue->u.adapter; + if (adapter->summary_addr + (adapter->summary_offset / 8) >= + (adapter->summary_addr & PAGE_MASK) + PAGE_SIZE) + return -EINVAL; + if (adapter->ind_addr + (adapter->ind_offset / 8) >= + (adapter->ind_addr & PAGE_MASK) + PAGE_SIZE) + return -EINVAL; + idx = srcu_read_lock(&kvm->srcu); uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); @@ -2911,7 +2891,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; @@ -3161,7 +3143,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm) if (!gi->origin) return; gisa_clear_ipm(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin); } void kvm_s390_gisa_init(struct kvm *kvm) @@ -3177,7 +3159,7 @@ void kvm_s390_gisa_init(struct kvm *kvm) hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL); memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)virt_to_phys(gi->origin); - VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); + VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin); } void kvm_s390_gisa_enable(struct kvm *kvm) @@ -3218,7 +3200,7 @@ void 
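get_map_page() above now follows the full remote-GUP protocol: take a reference on the mm (which may belong to another task), pin exactly one page, and honour the locked convention, since get_user_pages_remote() may drop mmap_lock internally. The pattern on its own:

/*
 * Sketch: pin a single user page from a possibly foreign mm.
 * If GUP dropped the lock for us, 'locked' comes back as 0 and we
 * must not unlock again.
 */
static struct page *pin_one_page(struct mm_struct *mm, unsigned long uaddr)
{
	struct page *page = NULL;
	int locked = 1;

	if (!mmget_not_zero(mm))
		return NULL; /* mm has already gone away */

	mmap_read_lock(mm);
	get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, &page, &locked);
	if (locked)
		mmap_read_unlock(mm);
	mmput(mm);

	return page; /* NULL if nothing could be pinned */
}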
kvm_s390_gisa_destroy(struct kvm *kvm) process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; - VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); + VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa); } void kvm_s390_gisa_disable(struct kvm *kvm) @@ -3467,7 +3449,7 @@ int __init kvm_s390_gib_init(u8 nisc) } } - KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); + KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc); goto out; out_unreg_gal: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fff863734975..e09960c2e6ed 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -10,10 +10,11 @@ * Jason J. Herne <jjherne@us.ibm.com> */ -#define KMSG_COMPONENT "kvm-s390" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "kvm-s390: " fmt #include <linux/compiler.h> +#include <linux/entry-virt.h> +#include <linux/export.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/hrtimer.h> @@ -39,7 +40,7 @@ #include <asm/lowcore.h> #include <asm/machine.h> #include <asm/stp.h> -#include <asm/gmap.h> +#include <asm/gmap_helpers.h> #include <asm/nmi.h> #include <asm/isc.h> #include <asm/sclp.h> @@ -51,8 +52,9 @@ #include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" -#include "pci.h" #include "gmap.h" +#include "faultin.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -63,7 +65,7 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -const struct _kvm_stats_desc kvm_vm_stats_desc[] = { +const struct kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, inject_io), STATS_DESC_COUNTER(VM, inject_float_mchk), @@ -89,7 +91,7 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; -const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { +const struct kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, exit_userspace), STATS_DESC_COUNTER(VCPU, exit_null), @@ -184,7 +186,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), - STATS_DESC_COUNTER(VCPU, pfault_sync) + STATS_DESC_COUNTER(VCPU, pfault_sync), + STATS_DESC_COUNTER(VCPU, signal_exits) }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -262,17 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) /* available subfunctions indicated via query / "test bit" */ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; -static struct gmap_notifier gmap_notifier; -static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ /* forward declarations */ -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); -static int sca_switch_to_extended(struct kvm *kvm); - static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { u8 delta_idx = 0; @@ -355,7 +352,7 @@ static __always_inline void pfcr_query(u8 (*query)[16]) { asm volatile( " lghi 0,0\n" - " .insn rsy,0xeb0000000016,0,0,%[query]\n" + " .insn rsy,0xeb0000000016,0,0,%[query]" : [query] "=QS" (*query) : : "cc", "0"); @@ -367,7 +364,7 @@ static __always_inline void __sortl_query(u8 (*query)[32]) " lghi 0,0\n" " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rre,0xb9380000,2,4\n" + " .insn 
rre,0xb9380000,2,4" : [query] "=R" (*query) : : "cc", "0", "1"); @@ -379,7 +376,7 @@ static __always_inline void __dfltcc_query(u8 (*query)[32]) " lghi 0,0\n" " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rrf,0xb9390000,2,4,6,0\n" + " .insn rrf,0xb9390000,2,4,6,0" : [query] "=R" (*query) : : "cc", "0", "1"); @@ -528,10 +525,6 @@ static int __init __kvm_s390_init(void) if (rc) goto err_gib; - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -551,8 +544,6 @@ err_kvm_uv: static void __kvm_s390_exit(void) { - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -563,12 +554,43 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { if (ioctl == KVM_S390_ENABLE_SIE) - return s390_enable_sie(); + return 0; return -EINVAL; } @@ -579,7 +601,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_S390_PSW: case KVM_CAP_S390_GMAP: - case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif @@ -606,6 +627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: + case KVM_CAP_S390_VSIE_ESAMODE: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -631,11 +655,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: - r = KVM_S390_BSCA_CPU_SLOTS; + /* + * Return the same value for KVM_CAP_MAX_VCPUS and + * KVM_CAP_MAX_VCPU_ID to conform with the KVM API. 
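kvm_s390_keyop() above is the kernel side of the new KVM_S390_KEYOP ioctl: it maps the SSKE/ISKE/RRBE operations onto the dat_* storage-key helpers while holding mmu_lock through the scope-based guard() from <linux/cleanup.h>, so every early return drops the lock automatically. A reduced sketch of that locking style, reusing the series' dat_get_storage_key():

#include <linux/cleanup.h>

/*
 * Sketch: guard(read_lock)(&lock) ties the lock to the enclosing
 * scope; there is no unlock call and no error-path goto ladder.
 */
static int read_key_locked(struct kvm *kvm, union asce asce, gfn_t gfn,
			   union skey *skey)
{
	guard(read_lock)(&kvm->mmu_lock);

	/* returning from here implicitly releases kvm->mmu_lock */
	return dat_get_storage_key(asce, gfn, skey);
}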
+ */ + r = KVM_S390_ESCA_CPU_SLOTS; if (!kvm_s390_use_sca_entries()) r = KVM_MAX_VCPUS; - else if (sclp.has_esca && sclp.has_64bscao) - r = KVM_S390_ESCA_CPU_SLOTS; if (ext == KVM_CAP_NR_VCPUS) r = min_t(unsigned int, num_online_cpus(), r); break; @@ -694,32 +720,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - int i; - gfn_t cur_gfn, last_gfn; - unsigned long gaddr, vmaddr; - struct gmap *gmap = kvm->arch.gmap; - DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - - /* Loop over all guest segments */ - cur_gfn = memslot->base_gfn; - last_gfn = memslot->base_gfn + memslot->npages; - for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { - gaddr = gfn_to_gpa(cur_gfn); - vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); - if (kvm_is_error_hva(vmaddr)) - continue; - - bitmap_zero(bitmap, _PAGE_ENTRIES); - gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); - for (i = 0; i < _PAGE_ENTRIES; i++) { - if (test_bit(i, bitmap)) - mark_page_dirty(kvm, cur_gfn + i); - } + gfn_t last_gfn = memslot->base_gfn + memslot->npages; - if (fatal_signal_pending(current)) - return; - cond_resched(); - } + scoped_guard(read_lock, &kvm->mmu_lock) + gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn); } /* Section: vm related */ @@ -879,9 +883,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - mmap_write_lock(kvm->mm); - kvm->mm->context.allow_gmap_hpage_1m = 1; - mmap_write_unlock(kvm->mm); + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -919,6 +921,17 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_USER_OPEREXEC: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC"); + kvm->arch.user_operexec = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; + case KVM_CAP_S390_VSIE_ESAMODE: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_VSIE_ESAMODE"); + kvm->arch.allow_vsie_esamode = 1; + r = 0; + break; default: r = -EINVAL; break; @@ -948,7 +961,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; - unsigned int idx; + switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: ret = -ENXIO; @@ -959,8 +972,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_lock(&kvm->lock); if (kvm->created_vcpus) ret = -EBUSY; - else if (kvm->mm->context.allow_gmap_hpage_1m) - ret = -EINVAL; else { kvm->arch.use_cmma = 1; /* Not compatible with cmma. 
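KVM_CAP_S390_USER_OPEREXEC, enabled above, forwards operation-exception intercepts to userspace (handle_operexc() now returns -EOPNOTSUPP when the flag is set). A userspace sketch of turning it on; the capability constant itself is new in this series:

/* Userspace sketch: enable the new capability on a VM fd. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_user_operexec(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_USER_OPEREXEC; /* added by this series */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap); /* 0 on success */
}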
*/ @@ -969,7 +980,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); break; - case KVM_S390_VM_MEM_CLR_CMMA: + case KVM_S390_VM_MEM_CLR_CMMA: { + gfn_t start_gfn = 0; + ret = -ENXIO; if (!sclp.has_cmma) break; @@ -978,13 +991,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - s390_reset_cmma(kvm->arch.gmap->mm); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); + do { + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); + cond_resched(); + } while (start_gfn); ret = 0; break; + } case KVM_S390_VM_MEM_LIMIT_SIZE: { unsigned long new_limit; @@ -1001,29 +1014,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_create takes last usable address */ - if (new_limit != KVM_S390_NO_MEM_LIMIT) - new_limit -= 1; - ret = -EBUSY; - mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { - /* gmap_create will round the limit up */ - struct gmap *new = gmap_create(current->mm, new_limit); - - if (!new) { - ret = -ENOMEM; - } else { - gmap_remove(kvm->arch.gmap); - new->private = kvm; - kvm->arch.gmap = new; - ret = 0; - } - } - mutex_unlock(&kvm->lock); + if (!kvm->created_vcpus) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%pK", - (void *) kvm->arch.gmap->asce); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", + (void *)kvm->arch.gmap->asce.val); break; } default: @@ -1188,19 +1184,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm->arch.migration_mode = 1; return 0; } - /* mark all the pages in active slots as dirty */ kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - /* - * The second half of the bitmap is only used on x86, - * and would be wasted otherwise, so we put it to good - * use here to keep track of the state of the storage - * attributes. - */ - memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); ram_pages += ms->npages; } + /* mark all the pages as dirty */ + gmap_set_cmma_all_dirty(kvm->arch.gmap); atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); kvm->arch.migration_mode = 1; kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); @@ -1433,7 +1423,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -EBUSY; goto out; } - proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1633,7 +1623,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_processor *proc; int ret = 0; - proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); + proc = kzalloc_obj(*proc, GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1661,7 +1651,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_machine *mach; int ret = 0; - mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); + mach = kzalloc_obj(*mach, GFP_KERNEL_ACCOUNT); if (!mach) { ret = -ENOMEM; goto out; @@ -1930,22 +1920,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) * Updates the Multiprocessor Topology-Change-Report bit to signal * the guest with a topology change. 
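KVM_S390_VM_MEM_CLR_CMMA above no longer walks userspace page tables; dat_reset_cmma() (new in this series) clears a chunk of guest CMMA state and returns the gfn to resume from, or 0 when it is done, which makes the loop trivially preemptible:

/*
 * Sketch: restartable CMMA reset. dat_reset_cmma() processes a
 * bounded amount of work per call so the caller can cond_resched()
 * between chunks instead of holding locks across all of guest memory.
 */
static void reset_cmma_all(struct gmap *gmap)
{
	gfn_t start_gfn = 0;

	do {
		start_gfn = dat_reset_cmma(gmap->asce, start_gfn);
		cond_resched(); /* yield between chunks */
	} while (start_gfn); /* 0 means "done" */
}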
* This is only relevant if the topology facility is present. - * - * The SCA version, bsca or esca, doesn't matter as offset is the same. */ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) { union sca_utility new, old; - struct bsca_block *sca; + struct esca_block *sca; - read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; old = READ_ONCE(sca->utility); do { new = old; new.mtcr = val; } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); - read_unlock(&kvm->arch.sca_lock); } static int kvm_s390_set_topo_change_indication(struct kvm *kvm, @@ -1966,9 +1952,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm, if (!test_kvm_facility(kvm, 11)) return -ENXIO; - read_lock(&kvm->arch.sca_lock); - topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; - read_unlock(&kvm->arch.sca_lock); + topo = kvm->arch.sca->utility.mtcr; return put_user(topo, (u8 __user *)attr->addr); } @@ -2112,40 +2096,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; /* Is this guest using storage keys? */ - if (!mm_uses_skeys(current->mm)) + if (!uses_skeys(kvm->arch.gmap)) return KVM_S390_GET_SKEYS_NONE; /* Enforce sane limit on memory allocation */ if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0; i < args->count; i++) { + r = dat_get_storage_key(kvm->arch.gmap->asce, + args->start_gfn + i, keys + i); + if (r) + break; } - - r = get_guest_storage_key(current->mm, hva, &keys[i]); - if (r) - break; } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -2160,10 +2136,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; - bool unlocked; + struct kvm_s390_mmu_cache *mc; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; @@ -2172,7 +2147,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -2184,159 +2159,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - r = s390_enable_skey(); + r = gmap_enable_skeys(kvm->arch.gmap); if (r) goto out; - i = 0; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < args->count) { - unlocked = false; - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - + r = -EINVAL; + for (i = 0; i < args->count; i++) { /* Lowest order 
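The MTCR update above also loses its lock: try_cmpxchg() rewrites old on failure, so the loop simply re-applies the one-bit change on top of whatever value won the race. Isolated:

/*
 * Sketch: lockless read-modify-write of the SCA utility word.
 * try_cmpxchg() updates 'old' when it fails, so each retry starts
 * from the freshly observed value.
 */
static void set_mtcr(struct esca_block *sca, int val)
{
	union sca_utility new, old = READ_ONCE(sca->utility);

	do {
		new = old;
		new.mtcr = val;
	} while (!try_cmpxchg(&sca->utility.val, &old.val, new.val));
}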
bit is reserved */ - if (keys[i] & 0x01) { - r = -EINVAL; - break; - } - - r = set_guest_storage_key(current->mm, hva, keys[i], 0); - if (r) { - r = fixup_user_fault(current->mm, hva, - FAULT_FLAG_WRITE, &unlocked); - if (r) - break; - } - if (!r) - i++; - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); -out: - kvfree(keys); - return r; -} - -/* - * Base address and length must be sent at the start of each block, therefore - * it's cheaper to send some clean data, as long as it's less than the size of - * two longs. - */ -#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) -/* for consistency */ -#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) - -static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long pgstev, hva, cur_gfn = args->start_gfn; - - args->count = 0; - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - /* - * We return an error if the first value was invalid, but we - * return successfully if at least one value was copied. - */ - if (kvm_is_error_hva(hva)) - return args->count ? 0 : -EFAULT; - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - res[args->count++] = (pgstev >> 24) & 0x43; - cur_gfn++; - } - - return 0; -} - -static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, - gfn_t gfn) -{ - return ____gfn_to_memslot(slots, gfn, true); -} - -static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, - unsigned long cur_gfn) -{ - struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); - unsigned long ofs = cur_gfn - ms->base_gfn; - struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; - - if (ms->base_gfn + ms->npages <= cur_gfn) { - mnode = rb_next(mnode); - /* If we are above the highest slot, wrap around */ - if (!mnode) - mnode = rb_first(&slots->gfn_tree); - - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = 0; + if (keys[i].zero) + goto out; } - if (cur_gfn < ms->base_gfn) - ofs = 0; - - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while (ofs >= ms->npages && (mnode = rb_next(mnode))) { - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + r = -ENOMEM; + goto out; } - return ms->base_gfn + ofs; -} - -static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *ms; - if (unlikely(kvm_memslots_empty(slots))) - return 0; - - cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); - ms = gfn_to_memslot(kvm, cur_gfn); - args->count = 0; - args->start_gfn = cur_gfn; - if (!ms) - return 0; - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = kvm_s390_get_gfn_end(slots); - - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - if (kvm_is_error_hva(hva)) - return 0; - /* Decrement only if we actually flipped the bit to 0 */ - if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_dec(&kvm->arch.cmma_dirty_pages); - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - /* Save the value */ - res[args->count++] = (pgstev >> 24) & 0x43; - /* If the next bit is too far away, stop. 
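kvm_s390_set_skeys() above introduces the topup-then-lock pattern that recurs throughout this series: objects are preallocated into a kvm_s390_mmu_cache while sleeping is still allowed, the real work runs under mmu_lock, and -ENOMEM from the locked section means "refill the cache and try again". Generalized into a sketch:

/*
 * Sketch: run a non-sleeping operation under mmu_lock, refilling the
 * preallocation cache (which may sleep) whenever the operation runs
 * out of preallocated objects.
 */
static int with_topup(struct kvm *kvm, struct kvm_s390_mmu_cache *mc,
		      int (*fn)(struct kvm_s390_mmu_cache *mc, void *arg),
		      void *arg)
{
	int r;

	do {
		r = kvm_s390_mmu_cache_topup(mc); /* may sleep */
		if (r)
			break;
		scoped_guard(read_lock, &kvm->mmu_lock)
			r = fn(mc, arg); /* must not sleep */
	} while (r == -ENOMEM);

	return r;
}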
*/ - if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) - return 0; - /* If we reached the previous "next", find the next one */ - if (cur_gfn == next_gfn) - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - /* Reached the end of memory or of the buffer, stop */ - if ((next_gfn >= mem_end) || - (next_gfn - args->start_gfn >= bufsize)) - return 0; - cur_gfn++; - /* Reached the end of the current memslot, take the next one. */ - if (cur_gfn - ms->base_gfn >= ms->npages) { - ms = gfn_to_memslot(kvm, cur_gfn); - if (!ms) - return 0; + r = 0; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r == -ENOMEM) + break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0 ; i < args->count; i++) { + r = dat_set_storage_key(mc, kvm->arch.gmap->asce, + args->start_gfn + i, keys[i], 0); + if (r) + break; + } } - } - return 0; + } while (r == -ENOMEM); + kvm_s390_free_mmu_cache(mc); +out: + kvfree(keys); + return r; } /* @@ -2350,8 +2207,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - unsigned long bufsize; - int srcu_idx, peek, ret; + int peek, ret; u8 *values; if (!kvm->arch.use_cmma) @@ -2364,8 +2220,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ - bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); - if (!bufsize || !kvm->mm->context.uses_cmm) { + args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!args->count || !uses_cmm(kvm->arch.gmap)) { memset(args, 0, sizeof(*args)); return 0; } @@ -2375,18 +2231,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, return 0; } - values = vmalloc(bufsize); + values = vmalloc(args->count); if (!values) return -ENOMEM; - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - if (peek) - ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); - else - ret = kvm_s390_get_cmma(kvm, args, values, bufsize); - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); + scoped_guard(read_lock, &kvm->mmu_lock) { + if (peek) + ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count, + values); + else + ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count, + values, &kvm->arch.cmma_dirty_pages); + } if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2408,11 +2264,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, static int kvm_s390_set_cmma_bits(struct kvm *kvm, const struct kvm_s390_cmma_log *args) { - unsigned long hva, mask, pgstev, i; - uint8_t *bits; - int srcu_idx, r = 0; - - mask = args->mask; + struct kvm_s390_mmu_cache *mc; + u8 *bits = NULL; + int r = 0; if (!kvm->arch.use_cmma) return -ENXIO; @@ -2426,9 +2280,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, if (args->count == 0) return 0; + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; bits = vmalloc(array_size(sizeof(*bits), args->count)); if (!bits) - return -ENOMEM; + goto out; r = copy_from_user(bits, (void __user *)args->values, args->count); if (r) { @@ -2436,29 +2293,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r) break; + 
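The per-page bookkeeping that used to live in kvm_s390_peek_cmma()/kvm_s390_get_cmma() moves into dat_peek_cmma()/dat_get_cmma(); the uapi is unchanged. For reference, a userspace sketch of the existing KVM_S390_GET_CMMA_BITS ioctl in peek mode, which works without migration mode:

/* Userspace sketch: peek CMMA values for a range of guest frames. */
#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

static int peek_cmma(int vm_fd, uint64_t start_gfn, uint8_t *buf,
		     uint32_t count)
{
	struct kvm_s390_cmma_log log;

	memset(&log, 0, sizeof(log));
	log.start_gfn = start_gfn;
	log.count = count;
	log.flags = KVM_S390_CMMA_PEEK;
	log.values = (uint64_t)(uintptr_t)buf;

	/* on success, log.count holds the number of values written */
	return ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log);
}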
scoped_guard(read_lock, &kvm->mmu_lock) { + r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, + args->count, args->mask, bits); } + } while (r == -ENOMEM); - pgstev = bits[i]; - pgstev = pgstev << 24; - mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; - set_pgste_bits(kvm->mm, hva, mask, pgstev); - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); - - if (!kvm->mm->context.uses_cmm) { - mmap_write_lock(kvm->mm); - kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); out: + kvm_s390_free_mmu_cache(mc); vfree(bits); return r; } @@ -2666,15 +2513,16 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (kvm_s390_pv_is_protected(kvm)) break; + mmap_write_lock(kvm->mm); /* - * FMT 4 SIE needs esca. As we never switch back to bsca from - * esca, we need no cleanup in the error cases below + * Disable creation of new THPs. Existing THPs can stay, they + * will be split when any part of them gets imported. */ - r = sca_switch_to_extended(kvm); - if (r) - break; - - r = s390_disable_cow_sharing(); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm); + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); + r = gmap_helper_disable_cow_sharing(); + mmap_write_unlock(kvm->mm); if (r) break; @@ -2746,9 +2594,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (copy_from_user(&parms, argp, sizeof(parms))) break; - /* Currently restricted to 8KB */ + /* Currently restricted to 1MiB */ r = -EINVAL; - if (parms.length > PAGE_SIZE * 2) + if (parms.length > SZ_1M) break; r = -ENOMEM; @@ -2902,9 +2750,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; - int r, srcu_idx; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | KVM_S390_MEMOP_F_CHECK_ONLY); @@ -2917,52 +2765,32 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -ENOMEM; } - srcu_idx = srcu_read_lock(&kvm->srcu); + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; - } + scoped_guard(srcu, &kvm->srcu) { + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? 
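kvm_s390_vm_mem_op_abs() above swaps its goto-based cleanup for the __free() attribute from <linux/cleanup.h>: the buffer is freed when the variable goes out of scope, on every path, and kvfree(NULL) is a no-op for the not-allocated case. The idiom by itself:

#include <linux/cleanup.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>

/*
 * Sketch: scope-bound allocation. No explicit kvfree() and no
 * out_free: label; each return path frees tmpbuf automatically.
 */
static int copy_in_scoped(void __user *uaddr, size_t size)
{
	void *tmpbuf __free(kvfree) = vmalloc(size);

	if (!tmpbuf)
		return -ENOMEM;
	if (copy_from_user(tmpbuf, uaddr, size))
		return -EFAULT; /* tmpbuf is freed on the way out */

	/* ... operate on tmpbuf ... */
	return 0;
}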
GACC_FETCH : GACC_STORE; - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - goto out_unlock; - } - if (acc_mode == GACC_FETCH) { + if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); + mop->size, acc_mode, mop->key); if (r) - goto out_unlock; - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_unlock; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + return r; + if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); - - vfree(tmpbuf); - return r; + return 0; } static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void __user *old_addr = (void __user *)mop->old_addr; - union { - __uint128_t quad; - char raw[sizeof(__uint128_t)]; - } old = { .quad = 0}, new = { .quad = 0 }; - unsigned int off_in_quad = sizeof(new) - mop->size; - int r, srcu_idx; - bool success; + union kvm_s390_quad old = { .sixteen = 0 }; + union kvm_s390_quad new = { .sixteen = 0 }; + bool success = false; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); if (r) @@ -2974,25 +2802,18 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m */ if (mop->size > sizeof(new)) return -EINVAL; - if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + if (copy_from_user(&new, uaddr, mop->size)) return -EFAULT; - if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + if (copy_from_user(&old, old_addr, mop->size)) return -EFAULT; - srcu_idx = srcu_read_lock(&kvm->srcu); + scoped_guard(srcu, &kvm->srcu) { + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, + mop->key, &success); - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; + if (!success && copy_to_user(old_addr, &old, mop->size)) + return -EFAULT; } - - r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, - new.quad, mop->key, &success); - if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) - r = -EFAULT; - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); return r; } @@ -3147,6 +2968,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; @@ -3314,10 +3161,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) static void sca_dispose(struct kvm *kvm) { - if (kvm->arch.use_esca) - free_pages_exact(kvm->arch.sca, sizeof(struct esca_block)); - else - free_page((unsigned long)(kvm->arch.sca)); + free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca)); kvm->arch.sca = NULL; } @@ -3331,10 +3175,11 @@ void 
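The KVM_S390_KEYOP branch above is the new VM ioctl's entry point: userspace passes an operation code, a guest absolute address and (for SSKE) a key, and gets the resulting key or reference bits back in kop.key. A hedged userspace sketch using the field and constant names visible in the patch:

/* Userspace sketch: read a guest page's storage key via ISKE. */
#include <linux/kvm.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

static int guest_iske(int vm_fd, uint64_t guest_addr, uint8_t *key)
{
	struct kvm_s390_keyop kop; /* uapi struct added by this series */
	int r;

	memset(&kop, 0, sizeof(kop));
	kop.operation = KVM_S390_KEYOP_ISKE;
	kop.guest_addr = guest_addr;

	r = ioctl(vm_fd, KVM_S390_KEYOP, &kop);
	if (!r)
		*key = kop.key; /* key as returned by the kernel */
	return r;
}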
kvm_arch_free_vm(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; - int i, rc; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO; char debug_name[16]; - static unsigned long sca_offset; + int i, rc; + + mutex_init(&kvm->arch.pv.import_lock); rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL @@ -3346,29 +3191,18 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) goto out_err; #endif - - rc = s390_enable_sie(); - if (rc) - goto out_err; - rc = -ENOMEM; if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; - rwlock_init(&kvm->arch.sca_lock); - /* start with basic SCA */ - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); - if (!kvm->arch.sca) - goto out_err; mutex_lock(&kvm_lock); - sca_offset += 16; - if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) - sca_offset = 0; - kvm->arch.sca = (struct bsca_block *) - ((char *) kvm->arch.sca + sca_offset); + + kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags); mutex_unlock(&kvm_lock); + if (!kvm->arch.sca) + goto out_err; - sprintf(debug_name, "kvm-%u", current->pid); + snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid); kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); if (!kvm->arch.dbf) @@ -3430,6 +3264,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "vm created with type %lu", type); + kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1; + kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit)); + if (!kvm->arch.gmap) + goto out_err; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags); + if (type & KVM_VM_S390_UCONTROL) { struct kvm_userspace_memory_region2 fake_memslot = { .slot = KVM_S390_UCONTROL_MEMSLOT, @@ -3439,23 +3279,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) .flags = 0, }; - kvm->arch.gmap = NULL; - kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; /* one flat fake memslot covering the whole address-space */ mutex_lock(&kvm->slots_lock); KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); mutex_unlock(&kvm->slots_lock); + set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); } else { - if (sclp.hamax == U64_MAX) - kvm->arch.mem_limit = TASK_SIZE_MAX; - else - kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, - sclp.hamax + 1); - kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); - if (!kvm->arch.gmap) - goto out_err; - kvm->arch.gmap->private = kvm; - kvm->arch.gmap->pfault_enabled = 0; + struct crst_table *table = dereference_asce(kvm->arch.gmap->asce); + + crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val); } kvm->arch.use_pfmfi = sclp.has_pfmfi; @@ -3466,7 +3298,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_gisa_init(kvm); INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); kvm->arch.pv.set_aside = NULL; - KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -3489,8 +3321,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); kvm_s390_update_topology_change_report(vcpu->kvm, 1); - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + 
} if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -3498,6 +3333,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_s390_pv_cpu_get_handle(vcpu)) kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); + kvm_s390_free_mmu_cache(vcpu->arch.mc); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -3524,154 +3360,48 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); - if (!kvm_is_ucontrol(kvm)) - gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); - KVM_EVENT(3, "vm 0x%pK destroyed", kvm); + kvm->arch.gmap = gmap_put(kvm->arch.gmap); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ -static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.gmap = gmap_create(current->mm, -1UL); - if (!vcpu->arch.gmap) - return -ENOMEM; - vcpu->arch.gmap->private = vcpu->kvm; - - return 0; -} - static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + struct esca_block *sca = vcpu->kvm->arch.sca; + if (!kvm_s390_use_sca_entries()) return; - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - sca->cpu[vcpu->vcpu_id].sda = 0; - } - read_unlock(&vcpu->kvm->arch.sca_lock); + clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = 0; } static void sca_add_vcpu(struct kvm_vcpu *vcpu) { - if (!kvm_s390_use_sca_entries()) { - phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); - - /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - return; - } - read_lock(&vcpu->kvm->arch.sca_lock); - if (vcpu->kvm->arch.use_esca) { - struct esca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); - } else { - struct bsca_block *sca = vcpu->kvm->arch.sca; - phys_addr_t sca_phys = virt_to_phys(sca); - - sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); - vcpu->arch.sie_block->scaoh = sca_phys >> 32; - vcpu->arch.sie_block->scaol = sca_phys; - set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); - } - read_unlock(&vcpu->kvm->arch.sca_lock); -} - -/* Basic SCA to Extended SCA data copy routines */ -static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s) -{ - d->sda = s->sda; - d->sigp_ctrl.c = s->sigp_ctrl.c; - d->sigp_ctrl.scn = s->sigp_ctrl.scn; -} - -static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s) -{ - int i; - - d->ipte_control = s->ipte_control; - d->mcn[0] = s->mcn; - for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++) - sca_copy_entry(&d->cpu[i], &s->cpu[i]); -} - -static int sca_switch_to_extended(struct kvm *kvm) -{ - struct bsca_block *old_sca = kvm->arch.sca; - struct esca_block *new_sca; - struct kvm_vcpu *vcpu; - unsigned long vcpu_idx; - u32 scaol, scaoh; - phys_addr_t new_sca_phys; - - if (kvm->arch.use_esca) - return 
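sca_add_vcpu()/sca_del_vcpu() below shrink to plain bit operations on the ESCA. The _inv helpers exist because the machine numbers SCA bits MSB-first: set_bit_inv(nr, ptr) is set_bit(nr ^ (BITS_PER_LONG - 1), ptr), i.e. the bit position is mirrored within each long. For example:

/*
 * Sketch: MSB-0 bit numbering as used for the SCA mcn mask.
 * vcpu_id 0 lands in the most significant bit of mcn[0].
 */
static void sca_mark_vcpu(struct esca_block *sca, unsigned int vcpu_id,
			  bool present)
{
	if (present)
		set_bit_inv(vcpu_id, (unsigned long *)sca->mcn);
	else
		clear_bit_inv(vcpu_id, (unsigned long *)sca->mcn);
}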
0; + struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!new_sca) - return -ENOMEM; - - new_sca_phys = virt_to_phys(new_sca); - scaoh = new_sca_phys >> 32; - scaol = new_sca_phys & ESCA_SCAOL_MASK; - - kvm_s390_vcpu_block_all(kvm); - write_lock(&kvm->arch.sca_lock); - - sca_copy_b_to_e(new_sca, old_sca); - - kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { - vcpu->arch.sie_block->scaoh = scaoh; - vcpu->arch.sie_block->scaol = scaol; - vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - } - kvm->arch.sca = new_sca; - kvm->arch.use_esca = 1; + /* we still need the sca header for the ipte control */ + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; - write_unlock(&kvm->arch.sca_lock); - kvm_s390_vcpu_unblock_all(kvm); - - free_page((unsigned long)old_sca); + if (!kvm_s390_use_sca_entries()) + return; - VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)", - old_sca, kvm->arch.sca); - return 0; + set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn); + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); } static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { - int rc; - - if (!kvm_s390_use_sca_entries()) { - if (id < KVM_MAX_VCPUS) - return true; - return false; - } - if (id < KVM_S390_BSCA_CPU_SLOTS) - return true; - if (!sclp.has_esca || !sclp.has_64bscao) - return false; - - rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); + if (!kvm_s390_use_sca_entries()) + return id < KVM_MAX_VCPUS; - return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; + return id < KVM_S390_ESCA_CPU_SLOTS; } /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ @@ -3917,7 +3647,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= ECA_IB; if (sclp.has_siif) vcpu->arch.sie_block->eca |= ECA_SII; - if (sclp.has_sigpif) + if (kvm_s390_use_sca_entries()) vcpu->arch.sie_block->eca |= ECA_SIGPI; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= ECA_VX; @@ -3978,9 +3708,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); + vcpu->arch.mc = kvm_s390_new_mmu_cache(); + if (!vcpu->arch.mc) + return -ENOMEM; sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!sie_page) + if (!sie_page) { + kvm_s390_free_mmu_cache(vcpu->arch.mc); + vcpu->arch.mc = NULL; return -ENOMEM; + } vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); @@ -4022,12 +3758,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; if (kvm_is_ucontrol(vcpu->kvm)) { - rc = __kvm_ucontrol_vcpu_init(vcpu); - if (rc) + rc = -ENOMEM; + vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL); + if (!vcpu->arch.gmap) goto out_free_sie_block; } - VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p", vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); @@ -4039,8 +3776,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; out_ucontrol_uninit: - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } 
out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; @@ -4104,32 +3843,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct kvm *kvm = gmap->private; - struct kvm_vcpu *vcpu; - unsigned long prefix; - unsigned long i; - - trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); - - if (gmap_is_shadow(gmap)) - return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - kvm_for_each_vcpu(i, vcpu, kvm) { - /* match against both prefix pages */ - prefix = kvm_s390_get_prefix(vcpu); - if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", - start, end); - kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); - } - } -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ @@ -4364,8 +4077,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - int ret = 0; - vcpu_load(vcpu); vcpu->run->s.regs.fpc = fpu->fpc; @@ -4376,7 +4087,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu_put(vcpu); - return ret; + return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -4513,72 +4224,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } -static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) -{ - struct kvm *kvm = gmap->private; - gfn_t gfn = gpa_to_gfn(gaddr); - bool unlocked; - hva_t vmaddr; - gpa_t tmp; - int rc; - - if (kvm_is_ucontrol(kvm)) { - tmp = __gmap_translate(gmap, gaddr); - gfn = gpa_to_gfn(tmp); - } - - vmaddr = gfn_to_hva(kvm, gfn); - rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!rc) - rc = __gmap_link(gmap, gaddr, vmaddr); - return rc; -} - -/** - * __kvm_s390_mprotect_many() - Apply specified protection to guest pages - * @gmap: the gmap of the guest - * @gpa: the starting guest address - * @npages: how many pages to protect - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() - * - * Context: kvm->srcu and gmap->mm need to be held in read mode - */ -int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, - unsigned long bits) +static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr) { - unsigned int fault_flag = (prot & PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; - gpa_t end = gpa + npages * PAGE_SIZE; int rc; - for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { - rc = gmap_protect_one(gmap, gpa, prot, bits); - if (rc == -EAGAIN) { - __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); - rc = gmap_protect_one(gmap, gpa, prot, bits); + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr); + if (rc == -EREMOTE) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = *gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; } - if (rc < 0) - return rc; + return rc; } - return 0; } -static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu) { gpa_t gaddr = kvm_s390_get_prefix(vcpu); - int idx, rc; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - mmap_read_lock(vcpu->arch.gmap->mm); + gfn_t gfn; + int rc; - rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + gfn = gpa_to_gfn(gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true); + if (rc) + return rc; + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true); + if (rc) + return rc; + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn); return rc; } @@ -4598,7 +4278,7 @@ retry: if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = kvm_s390_mprotect_notify_prefix(vcpu); + rc = kvm_s390_fixup_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -4647,8 +4327,7 @@ retry: * Re-enable CMM virtualization if CMMA is available and * CMM has been used. */ - if ((vcpu->kvm->arch.use_cmma) && - (vcpu->kvm->mm->context.uses_cmm)) + if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap)) vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; goto retry; } @@ -4744,7 +4423,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; @@ -4760,7 +4439,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) return false; - if (!vcpu->arch.gmap->pfault_enabled) + if (!pfault_enabled(vcpu->arch.gmap)) return false; hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); @@ -4784,9 +4463,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14]; vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15]; - if (need_resched()) - schedule(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); if (rc || guestdbg_exit_pending(vcpu)) @@ -4856,109 +4532,36 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) current->thread.gmap_int_code, current->thread.gmap_teid.val); } -/* - * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu - * @vcpu: the vCPU whose gmap is to be fixed up - * @gfn: the guest frame number used for memslots (including fake memslots) - * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @flags: FOLL_* flags - * - * Return: 0 on success, < 0 in case of error. - * Context: The mm lock must not be held before calling. 
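kvm_s390_fixup_prefix() above replaces the old mprotect-based prefix notification: the two pages of the prefix area (the guest's low core spans 8 KiB) are faulted in writably first, and only then is the notification bit armed under the write lock, so a racing unmap re-triggers KVM_REQ_REFRESH_GUEST_PREFIX. Condensed, with dat_set_prefix_notif_bit() being the helper introduced by this series:

/*
 * Sketch: make the 2-page prefix area resident, then arm the
 * invalidation notifier for it.
 */
static int refresh_prefix(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	int rc;

	rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true);
	if (!rc)
		rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true);
	if (rc)
		return rc;

	scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
		rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn);
	return rc;
}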
May sleep. - */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) -{ - struct kvm_memory_slot *slot; - unsigned int fault_flags; - bool writable, unlocked; - unsigned long vmaddr; - struct page *page; - kvm_pfn_t pfn; +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr) +{ + struct guest_fault f = { + .write_attempt = wr, + .attempt_pfault = pfault_enabled(vcpu->arch.gmap), + }; int rc; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return vcpu_post_run_addressing_exception(vcpu); - - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; - if (vcpu->arch.gmap->pfault_enabled) - flags |= FOLL_NOWAIT; - vmaddr = __gfn_to_hva_memslot(slot, gfn); - -try_again: - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + f.gfn = gpa_to_gfn(gaddr); - /* Access outside memory, inject addressing exception */ - if (is_noslot_pfn(pfn)) + rc = kvm_s390_faultin_gfn(vcpu, NULL, &f); + if (rc <= 0) + return rc; + if (rc == PGM_ADDRESSING) return vcpu_post_run_addressing_exception(vcpu); - /* Signal pending: try again */ - if (pfn == KVM_PFN_ERR_SIGPENDING) - return -EAGAIN; - - /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ - if (pfn == KVM_PFN_ERR_NEEDS_IO) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - /* Could not setup async pfault, try again synchronously */ - flags &= ~FOLL_NOWAIT; - goto try_again; - } - /* Any other error */ - if (is_error_pfn(pfn)) - return -EFAULT; - - /* Success */ - mmap_read_lock(vcpu->arch.gmap->mm); - /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ - rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); - if (!rc) - rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); - scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { - kvm_release_faultin_page(vcpu->kvm, page, false, writable); - } - mmap_read_unlock(vcpu->arch.gmap->mm); - return rc; -} - -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) -{ - unsigned long gaddr_tmp; - gfn_t gfn; - - gfn = gpa_to_gfn(gaddr); - if (kvm_is_ucontrol(vcpu->kvm)) { - /* - * This translates the per-vCPU guest address into a - * fake guest address, which can then be used with the - * fake memslots that are identity mapping userspace. - * This allows ucontrol VMs to use the normal fault - * resolution path, like normal VMs. 
- */ - mmap_read_lock(vcpu->arch.gmap->mm); - gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - if (gaddr_tmp == -EFAULT) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; - return -EREMOTE; - } - gfn = gpa_to_gfn(gaddr_tmp); - } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); + KVM_BUG_ON(rc, vcpu->kvm); + return -EINVAL; } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { - unsigned int flags = 0; + unsigned int foll = 0; unsigned long gaddr; int rc; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { case 0: @@ -4973,7 +4576,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) * previous protected guest. The old pages need to be destroyed * so the new guest can use them. */ - if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) { + if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) { /* * Either KVM messed up the secure guest mapping or the * same page is mapped into multiple secure guests. @@ -4995,12 +4598,12 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) * guest has not been imported yet. Try to import the page into * the protected guest. */ - rc = gmap_convert_to_secure(vcpu->arch.gmap, gaddr); + rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr); if (rc == -EINVAL) send_sig(SIGSEGV, current, 0); if (rc != -ENXIO) break; - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; fallthrough; case PGM_PROTECTION: case PGM_SEGMENT_TRANSLATION: @@ -5010,7 +4613,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - return vcpu_dat_fault_handler(vcpu, gaddr, flags); + return vcpu_dat_fault_handler(vcpu, gaddr, foll); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); @@ -5020,7 +4623,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) return 0; } -static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) +static int vcpu_post_run(struct kvm_vcpu *vcpu, int sie_return) { struct mcck_volatile_info *mcck_info; struct sie_page *sie_page; @@ -5036,14 +4639,14 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; - if (exit_reason == -EINTR) { - VCPU_EVENT(vcpu, 3, "%s", "machine check"); + if (sie_return == SIE64_RETURN_MCCK) { sie_page = container_of(vcpu->arch.sie_block, struct sie_page, sie_block); mcck_info = &sie_page->mcck_info; kvm_s390_reinject_machine_check(vcpu, mcck_info); return 0; } + WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL); if (vcpu->arch.sie_block->icptcode > 0) { rc = kvm_handle_sie_intercept(vcpu); @@ -5060,10 +4663,29 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) return vcpu_post_run_handle_fault(vcpu); } +int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb, + u64 *gprs, unsigned long gasce) +{ + int ret; + + guest_state_enter_irqoff(); + + /* + * The guest_state_{enter,exit}_irqoff() functions inform lockdep and + * tracing that entry to the guest will enable host IRQs, and exit from + * the guest will disable host IRQs. 
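+	 * This wrapper is noinstr: between guest_state_enter_irqoff() and
+	 * guest_state_exit_irqoff() no instrumentable host code may run, so
+	 * the caller must enter with hard IRQs already disabled.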
+ */ + ret = sie64a(scb, gprs, gasce); + + guest_state_exit_irqoff(); + + return ret; +} + #define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { - int rc, exit_reason; + int rc, sie_return; struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* @@ -5072,28 +4694,45 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) */ kvm_vcpu_srcu_read_lock(vcpu); - do { + while (true) { rc = vcpu_pre_run(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); if (rc || guestdbg_exit_pending(vcpu)) break; - kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between - * guest_enter and guest_exit should be no uaccess. + * guest_timing_enter_irqoff and guest_timing_exit_irqoff + * should be no uaccess. */ - local_irq_disable(); - guest_enter_irqoff(); - __disable_cpu_timer_accounting(vcpu); - local_irq_enable(); if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(sie_page->pv_grregs, vcpu->run->s.regs.gprs, sizeof(sie_page->pv_grregs)); } - exit_reason = sie64a(vcpu->arch.sie_block, - vcpu->run->s.regs.gprs, - vcpu->arch.gmap->asce); + +xfer_to_guest_mode_check: + local_irq_disable(); + xfer_to_guest_mode_prepare(); + if (xfer_to_guest_mode_work_pending()) { + local_irq_enable(); + rc = kvm_xfer_to_guest_mode_handle_work(vcpu); + if (rc) + break; + goto xfer_to_guest_mode_check; + } + + guest_timing_enter_irqoff(); + __disable_cpu_timer_accounting(vcpu); + + sie_return = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs, + vcpu->arch.gmap->asce.val); + + __enable_cpu_timer_accounting(vcpu); + guest_timing_exit_irqoff(); + local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy(vcpu->run->s.regs.gprs, sie_page->pv_grregs, @@ -5109,16 +4748,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; } } - local_irq_disable(); - __enable_cpu_timer_accounting(vcpu); - guest_exit_irqoff(); - local_irq_enable(); kvm_vcpu_srcu_read_lock(vcpu); - rc = vcpu_post_run(vcpu, exit_reason); - } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); + rc = vcpu_post_run(vcpu, sie_return); + if (rc || guestdbg_exit_pending(vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); + break; + } + } - kvm_vcpu_srcu_read_unlock(vcpu); return rc; } @@ -5334,6 +4972,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (signal_pending(current) && !rc) { kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->stat.signal_exits++; rc = -EINTR; } @@ -5623,8 +5262,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | @@ -5646,32 +5285,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, acc_mode, mop->key); - goto out_inject; - } - if (acc_mode == GACC_FETCH) { + } else if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r) - goto out_inject; - if (copy_to_user(uaddr, tmpbuf, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (!r && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = 
write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); } -out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); -out_free: - vfree(tmpbuf); return r; } @@ -5700,8 +5328,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; @@ -5861,44 +5489,58 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #ifdef CONFIG_KVM_S390_UCONTROL case KVM_S390_UCAS_MAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, - ucasmap.vcpu_addr, ucasmap.length); + r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr), + gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); break; } case KVM_S390_UCAS_UNMAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, - ucasmap.length); + gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); + r = 0; break; } #endif case KVM_S390_VCPU_FAULT: { - idx = srcu_read_lock(&vcpu->kvm->srcu); - r = vcpu_dat_fault_handler(vcpu, arg, 0); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + gpa_t gaddr = arg; + + scoped_guard(srcu, &vcpu->kvm->srcu) { + r = vcpu_ucontrol_translate(vcpu, &gaddr); + if (r) + break; + + r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false); + if (r == PGM_ADDRESSING) + r = -EFAULT; + if (r <= 0) + break; + r = -EIO; + KVM_BUG_ON(r, vcpu->kvm); + } break; } case KVM_ENABLE_CAP: @@ -6012,9 +5654,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *new, enum kvm_mr_change change) { - gpa_t size; - - if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) + if (kvm_is_ucontrol(kvm) && new && new->id < KVM_USER_MEM_SLOTS) return -EINVAL; /* When we are protected, we should not change the memory slots */ @@ -6023,20 +5663,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { /* - * A few sanity checks. We can have memory slots which have to be - * located/ended at a segment boundary (1MB). The memory in userland is - * ok to be fragmented into various different vmas. It is okay to mmap() - * and munmap() stuff in this slot after doing this call at any time + * A few sanity checks. The memory in userland is ok to be + * fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any + * time. 
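+		 * Slots no longer need to start or end on a 1 MB segment
+		 * boundary; page alignment of the userspace address is enough.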
 		 */
-
-		if (new->userspace_addr & 0xffffful)
+		if (new->userspace_addr & ~PAGE_MASK)
 			return -EINVAL;
-
-		size = new->npages * PAGE_SIZE;
-		if (size & 0xffffful)
-			return -EINVAL;
-
-		if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+		if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit)
 			return -EINVAL;
 	}
 
@@ -6064,37 +5698,89 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
+	struct kvm_s390_mmu_cache *mc = NULL;
 	int rc = 0;
 
-	if (kvm_is_ucontrol(kvm))
+	if (change == KVM_MR_FLAGS_ONLY)
 		return;
 
-	switch (change) {
-	case KVM_MR_DELETE:
-		rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
-					old->npages * PAGE_SIZE);
-		break;
-	case KVM_MR_MOVE:
-		rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
-					old->npages * PAGE_SIZE);
-		if (rc)
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	scoped_guard(write_lock, &kvm->mmu_lock) {
+		switch (change) {
+		case KVM_MR_DELETE:
+			rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
 			break;
-		fallthrough;
-	case KVM_MR_CREATE:
-		rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
-				      new->base_gfn * PAGE_SIZE,
-				      new->npages * PAGE_SIZE);
-		break;
-	case KVM_MR_FLAGS_ONLY:
-		break;
-	default:
-		WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+		case KVM_MR_MOVE:
+			rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
+			if (rc)
+				break;
+			fallthrough;
+		case KVM_MR_CREATE:
+			rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages);
+			break;
+		case KVM_MR_FLAGS_ONLY:
+			break;
+		default:
+			WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+		}
 	}
+out:
 	if (rc)
 		pr_warn("failed to commit memory region\n");
+	kvm_s390_free_mmu_cache(mc);
 	return;
 }
 
+/**
+ * kvm_test_age_gfn() - test young
+ * @kvm: the kvm instance
+ * @range: the range of guest addresses whose young status needs to be tested
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range is young, otherwise false.
+ */
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	scoped_guard(read_lock, &kvm->mmu_lock)
+		return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end);
+}
+
+/**
+ * kvm_age_gfn() - clear young
+ * @kvm: the kvm instance
+ * @range: the range of guest addresses whose young status needs to be cleared
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range was young, otherwise false.
+ */
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	scoped_guard(read_lock, &kvm->mmu_lock)
+		return gmap_age_gfn(kvm->arch.gmap, range->start, range->end);
+}
+
+/**
+ * kvm_unmap_gfn_range() - Unmap a range of guest addresses
+ * @kvm: the kvm instance
+ * @range: the range of guest page frames to invalidate
+ *
+ * This function always returns false because every DAT table modification
+ * has to use the appropriate DAT table manipulation instructions, which will
+ * keep the TLB coherent, hence no additional TLB flush is ever required.
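+ * The DAT-table manipulation instructions involved (e.g. IPTE, IDTE and
+ * CSP) purge the affected TLB entries on all CPUs as part of the update.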
+ * + * Context: called by KVM common code with the kvm mmu write lock held + * Return: false + */ +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end); +} + static inline unsigned long nonhyp_mask(int i) { unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; @@ -6111,11 +5797,6 @@ static int __init kvm_s390_init(void) return -ENODEV; } - if (nested && hpage) { - pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); - return -EINVAL; - } - for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8d3bbb2dd8d2..dc0573b7aa4b 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,9 +19,19 @@ #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> +#include "dat.h" +#include "gmap.h" #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) +union kvm_s390_quad { + __uint128_t sixteen; + unsigned long eight; + unsigned int four; + unsigned short two; + unsigned char one; +}; + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); @@ -106,15 +116,15 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) static inline int kvm_is_ucontrol(struct kvm *kvm) { #ifdef CONFIG_KVM_S390_UCONTROL - if (kvm->arch.gmap) - return 0; - return 1; + return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); #else return 0; #endif } -#define GUEST_PREFIX_SHIFT 13 +#define GUEST_PREFIX_SHIFT 12 +#define GUEST_PREFIX_MASK_ZARCH 0x7fffe +#define GUEST_PREFIX_MASK_ESA 0x7ffff static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) { return vcpu->arch.sie_block->prefix << GUEST_PREFIX_SHIFT; @@ -125,6 +135,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id, prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; + vcpu->arch.sie_block->prefix &= GUEST_PREFIX_MASK_ZARCH; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } @@ -308,6 +319,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, u16 *rc, u16 *rrc); +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { @@ -319,6 +333,41 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) return vcpu->arch.pv.handle; } +/** + * __kvm_s390_pv_destroy_page() - Destroy a guest page. + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: must be called holding the mm lock for gmap->mm + */ +static inline int __kvm_s390_pv_destroy_page(struct page *page) +{ + struct folio *folio = page_folio(page); + int rc; + + /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. 
*/ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -394,8 +443,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); @@ -419,14 +467,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, unsigned long bits); -static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) -{ - return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); -} +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -528,13 +572,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); -/* support for Basic/Extended SCA handling */ -static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) -{ - struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */ - - return &sca->ipte_control; -} static inline int kvm_s390_use_sca_entries(void) { /* @@ -542,7 +579,7 @@ static inline int kvm_s390_use_sca_entries(void) * might use the entries. By not setting the entries and keeping them * invalid, hardware will not access them but intercept. 
*/ - return sclp.has_sigpif; + return sclp.has_sigpif && sclp.has_esca; } void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct mcck_volatile_info *mcck_info); diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 8c40154ff50f..86d93e8dddae 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -54,7 +54,7 @@ static int zpci_setup_aipb(u8 nisc) struct page *page; int size, rc; - zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + zpci_aipb = kzalloc_obj(union zpci_sic_iib); if (!zpci_aipb) return -ENOMEM; @@ -126,8 +126,7 @@ int kvm_s390_pci_aen_init(u8 nisc) return -EPERM; mutex_lock(&aift->aift_lock); - aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), - GFP_KERNEL); + aift->kzdev = kzalloc_objs(struct kvm_zdev *, ZPCI_NR_DEVICES); if (!aift->kzdev) { rc = -ENOMEM; goto unlock; @@ -404,7 +403,7 @@ static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) { struct kvm_zdev *kzdev; - kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + kzdev = kzalloc_obj(struct kvm_zdev); if (!kzdev) return -ENOMEM; @@ -666,7 +665,7 @@ int __init kvm_s390_pci_init(void) if (!kvm_s390_pci_interp_allowed()) return 0; - aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + aift = kzalloc_obj(struct zpci_aift); if (!aift) return -ENOMEM; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 1a49b89706f8..cc0553da14cb 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -21,13 +21,14 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/page-states.h> -#include <asm/gmap.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> +#include <asm/gmap_helpers.h> #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" +#include "gmap.h" static int handle_ri(struct kvm_vcpu *vcpu) { @@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) if (vcpu->arch.skey_enabled) return 0; - rc = s390_enable_skey(); + rc = gmap_enable_skeys(vcpu->arch.gmap); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (rc) return rc; @@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) static int handle_iske(struct kvm_vcpu *vcpu) { - unsigned long gaddr, vmaddr; - unsigned char key; + unsigned long gaddr; int reg1, reg2; - bool unlocked; + union skey key; int rc; vcpu->stat.instruction_iske++; @@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = get_guest_storage_key(current->mm, vmaddr, &key); - - if (rc) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; - vcpu->run->s.regs.gprs[reg1] |= key; + vcpu->run->s.regs.gprs[reg1] |= key.skey; return 0; } static int handle_rrbe(struct kvm_vcpu *vcpu) { - unsigned long vmaddr, gaddr; + unsigned long gaddr; int reg1, reg2; - 
bool unlocked; int rc; vcpu->stat.instruction_rrbe++; @@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = reset_guest_reference_bit(current->mm, vmaddr); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr)); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; kvm_s390_set_psw_cc(vcpu, rc); @@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) { unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; unsigned long start, end; - unsigned char key, oldkey; + union skey key, oldkey; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_sske++; @@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { @@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - unlocked = false; - - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, - m3 & SSKE_NQ, m3 & SSKE_MR, - m3 & SSKE_MC); - - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? 
-EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) + if (rc > 1) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; start += PAGE_SIZE; @@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } else { kvm_s390_set_psw_cc(vcpu, rc); vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; - vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8; } } if (m3 & SSKE_MB) { @@ -605,6 +564,14 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) } } +#if IS_ENABLED(CONFIG_VFIO_AP) +bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) +{ + return kvm_is_gpa_in_memslot(kvm, gpa); +} +EXPORT_SYMBOL_FOR_MODULES(kvm_s390_is_gpa_in_memslot, "vfio_ap"); +#endif + /* * handle_pqap: Handling pqap interception * @vcpu: the vcpu having issue the pqap instruction @@ -746,13 +713,14 @@ int is_valid_psw(psw_t *psw) int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) { psw_t *gpsw = &vcpu->arch.sie_block->gpsw; - psw_compat_t new_psw; - u64 addr; + psw32_t new_psw; + u64 addr, iaddr; int rc; u8 ar; vcpu->stat.instruction_lpsw++; + iaddr = gpsw->addr - kvm_s390_get_ilen(vcpu); if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -770,18 +738,20 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; if (!is_valid_psw(gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + vcpu->arch.sie_block->gbea = iaddr; return 0; } static int handle_lpswe(struct kvm_vcpu *vcpu) { psw_t new_psw; - u64 addr; + u64 addr, iaddr; int rc; u8 ar; vcpu->stat.instruction_lpswe++; + iaddr = vcpu->arch.sie_block->gpsw.addr - kvm_s390_get_ilen(vcpu); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -794,6 +764,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw = new_psw; if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + vcpu->arch.sie_block->gbea = iaddr; return 0; } @@ -1074,7 +1045,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; - unsigned char key; + union skey key; vcpu->stat.instruction_pfmf++; @@ -1102,7 +1073,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; - key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; + key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); @@ -1133,14 +1104,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr; - bool unlocked = false; - - /* Translate guest address to host address */ - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -1151,19 +1114,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - 
mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, - key, NULL, nq, mr, mc); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, + NULL, nq, mr, mc); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc > 1) + return kvm_s390_inject_program_int(vcpu, rc); + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; } @@ -1187,8 +1148,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { int r1, r2, nappended, entries; - unsigned long gfn, hva, res, pgstev, ptev; + union essa_state state; unsigned long *cbrlo; + unsigned long gfn; + bool dirtied; /* * We don't need to set SD.FPF.SK to 1 here, because if we have a @@ -1197,33 +1160,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) kvm_s390_get_regs_rre(vcpu, &r1, &r2); gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; - hva = gfn_to_hva(vcpu->kvm, gfn); entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - if (kvm_is_error_hva(hva)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); - if (nappended < 0) { - res = orc ? 0x10 : 0; - vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied); + vcpu->run->s.regs.gprs[r1] = state.val; + if (nappended < 0) return 0; - } - res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; - /* - * Set the block-content state part of the result. 0 means resident, so - * nothing to do if the page is valid. 2 is for preserved pages - * (non-present and non-zero), and 3 for zero pages (non-present and - * zero). - */ - if (ptev & _PAGE_INVALID) { - res |= 2; - if (pgstev & _PGSTE_GPS_ZERO) - res |= 1; - } - if (pgstev & _PGSTE_GPS_NODAT) - res |= 0x20; - vcpu->run->s.regs.gprs[r1] = res; /* * It is possible that all the normal 511 slots were full, in which case * we will now write in the 512th slot, which is reserved for host use. 
@@ -1235,27 +1177,44 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { - struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); - - /* Increment only if we are really flipping the bit */ - if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); - } + if (dirtied) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); return nappended; } +static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int i; + + lockdep_assert_held(&vcpu->kvm->mmu_lock); + + for (i = 0; i < len; i++) { + if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce, + 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) + continue; + if (!ptep || ptep->s.pr) + continue; + pgste = pgste_get_lock(ptep); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) + gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + pgste_set_unlock(ptep, pgste); + } +} + static int handle_essa(struct kvm_vcpu *vcpu) { + lockdep_assert_held(&vcpu->kvm->srcu); + /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; - struct gmap *gmap; int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); - gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); @@ -1279,11 +1238,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) * value really needs to be written to; if the value is * already correct, we do nothing and avoid the lock. */ - if (vcpu->kvm->mm->context.uses_cmm == 0) { - mmap_write_lock(vcpu->kvm->mm); - vcpu->kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(vcpu->kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags); /* * If we are here, we are supposed to have CMMA enabled in * the SIE block. Enabling CMMA works on a per-CPU basis, @@ -1297,24 +1252,22 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - int srcu_idx; - - mmap_read_lock(vcpu->kvm->mm); - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - i = __do_essa(vcpu, orc); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - mmap_read_unlock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + i = __do_essa(vcpu, orc); if (i < 0) return i; /* Account for the possible extra cbrl entry */ entries += i; } - vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ + /* reset nceo */ + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - mmap_read_lock(gmap->mm); - for (i = 0; i < entries; ++i) - __gmap_zap(gmap, cbrlo[i]); - mmap_read_unlock(gmap->mm); + + mmap_read_lock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + _essa_clear_cbrl(vcpu, cbrlo, entries); + mmap_read_unlock(vcpu->kvm->mm); + return 0; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 22c012aa5206..c2dafd812a3b 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -5,19 +5,23 @@ * Copyright IBM Corp. 
2019, 2020 * Author(s): Janosch Frank <frankja@linux.ibm.com> */ + +#include <linux/export.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/minmax.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> -#include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include "kvm-s390.h" +#include "dat.h" +#include "gaccess.h" #include "gmap.h" +#include "faultin.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -34,6 +38,163 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); /** + * should_export_before_import() - Determine whether an export is needed + * before an import-like operation. + * @uvcb: The Ultravisor control block of the UVC to be performed. + * @mm: The mm of the process. + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: %true if an export is needed before every import, otherwise %false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +struct pv_make_secure { + void *uvcb; + struct folio *folio; + int rc; + bool needs_export; +}; + +static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio) +{ + struct pv_make_secure *priv = f->priv; + int rc; + + if (priv->needs_export) + uv_convert_from_secure(folio_to_phys(folio)); + + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) + return -E2BIG; + + if (!f->page) + folio_get(folio); + rc = __make_folio_secure(folio, priv->uvcb); + if (!f->page) + folio_put(folio); + + return rc; +} + +static void _kvm_s390_pv_make_secure(struct guest_fault *f) +{ + struct pv_make_secure *priv = f->priv; + struct folio *folio; + + folio = pfn_folio(f->pfn); + priv->rc = -EAGAIN; + if (folio_trylock(folio)) { + priv->rc = __kvm_s390_pv_make_secure(f, folio); + if (priv->rc == -E2BIG || priv->rc == -EBUSY) { + priv->folio = folio; + folio_get(folio); + } + folio_unlock(folio); + } +} + +/** + * kvm_s390_pv_make_secure() - make one guest page secure + * @kvm: the guest + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error. 
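+ *         -EAGAIN indicates a transient condition (e.g. the folio is
+ *         locked by a concurrent fault) and the caller should retry.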
+ */ +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) +{ + struct pv_make_secure priv = { .uvcb = uvcb }; + struct guest_fault f = { + .write_attempt = true, + .gfn = gpa_to_gfn(gaddr), + .callback = _kvm_s390_pv_make_secure, + .priv = &priv, + }; + int rc; + + lockdep_assert_held(&kvm->srcu); + + priv.needs_export = should_export_before_import(uvcb, kvm->mm); + + scoped_guard(mutex, &kvm->arch.pv.import_lock) { + rc = kvm_s390_faultin_gfn(NULL, kvm, &f); + + if (!rc) { + rc = priv.rc; + if (priv.folio) { + rc = s390_wiggle_split_folio(kvm->mm, priv.folio); + if (!rc) + rc = -EAGAIN; + } + } + } + if (priv.folio) + folio_put(priv.folio); + return rc; +} + +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = gaddr, + }; + + return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb); +} + +/** + * kvm_s390_pv_destroy_page() - Destroy a guest page. + * @kvm: the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(kvm->mm); + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + if (page) + rc = __kvm_s390_pv_destroy_page(page); + kvm_release_page_clean(page); + mmap_read_unlock(kvm->mm); + return rc; +} + +/** * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to * be destroyed * @@ -240,35 +401,6 @@ done_fast: return 0; } -/** - * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. - * @kvm: the VM whose memory is to be cleared. - * - * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. - * The CPUs of the protected VM need to be destroyed beforehand. 
- */ -static void kvm_s390_destroy_lower_2g(struct kvm *kvm) -{ - const unsigned long pages_2g = SZ_2G / PAGE_SIZE; - struct kvm_memory_slot *slot; - unsigned long len; - int srcu_idx; - - srcu_idx = srcu_read_lock(&kvm->srcu); - - /* Take the memslot containing guest absolute address 0 */ - slot = gfn_to_memslot(kvm, 0); - /* Clear all slots or parts thereof that are below 2GB */ - while (slot && slot->base_gfn < pages_2g) { - len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; - s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); - /* Take the next memslot */ - slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); - } - - srcu_read_unlock(&kvm->srcu, srcu_idx); -} - static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_destroy_fast uvcb = { @@ -283,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) *rc = uvcb.header.rc; if (rrc) *rrc = uvcb.header.rrc; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", uvcb.header.rc, uvcb.header.rrc); WARN_ONCE(cc && uvcb.header.rc != 0x104, @@ -332,10 +463,10 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* Guest with segment type ASCE, refuse to destroy asynchronously */ - if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT) return -EINVAL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; @@ -345,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) priv->stor_var = kvm->arch.pv.stor_var; priv->stor_base = kvm->arch.pv.stor_base; priv->handle = kvm_s390_pv_get_handle(kvm); - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce); if (s390_replace_asce(kvm->arch.gmap)) res = -ENOMEM; } @@ -356,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return res; } - kvm_s390_destroy_lower_2g(kvm); + gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false); kvm_s390_clear_pv_state(kvm); kvm->arch.pv.set_aside = priv; @@ -390,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -473,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) * cleanup has been performed. 
*/ if (need_zap && mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false); mmput(kvm->mm); } @@ -511,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* When a fatal signal is received, stop immediately */ - if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true)) goto done; if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) ret = -EIO; @@ -550,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { @@ -565,6 +695,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) int cc, ret; u16 dummy; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + /* The notifier will be unregistered when the VM is destroyed */ + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + if (ret) { + kvm->arch.pv.mmu_notifier.ops = NULL; + return ret; + } + } + ret = kvm_s390_pv_alloc_vm(kvm); if (ret) return ret; @@ -572,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Inputs */ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; - uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_asce = kvm->arch.gmap->asce.val; uvcb.guest_sca = virt_to_phys(kvm->arch.sca); uvcb.conf_base_stor_origin = virt_to_phys((void *)kvm->arch.pv.stor_base); @@ -580,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); + gmap_split_huge_pages(kvm->arch.gmap); + cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; @@ -599,12 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) } return -EIO; } - kvm->arch.gmap->guest_handle = uvcb.guest_handle; - /* Add the notifier only once. 
No races because we hold kvm->lock */ - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); - } return 0; } @@ -638,27 +776,15 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[0] = tweak, .tweak[1] = offset, }; - int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); - unsigned long vmaddr; - bool unlocked; + int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; if (ret == -ENXIO) { - mmap_read_lock(kvm->mm); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); - if (kvm_is_error_hva(vmaddr)) { - ret = -EFAULT; - } else { - ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!ret) - ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); - } - mmap_read_unlock(kvm->mm); + ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true); if (!ret) return -EAGAIN; - return ret; } if (ret && ret != -EAGAIN) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 9ac92dbf680d..9e28f165c114 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu, __entry->sie_block = sie_block; ), - TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK", + TP_printk("create cpu %d at 0x%p, sie block at 0x%p", __entry->id, __entry->vcpu, __entry->sie_block) ); @@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css, __entry->kvm = kvm; ), - TP_printk("enabling channel I/O support (kvm @ %pK)\n", + TP_printk("enabling channel I/O support (kvm @ %p)\n", __entry->kvm) ); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a78df3a4f353..e5a23f1c9749 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/mman.h> -#include <asm/gmap.h> #include <asm/mmu_context.h> #include <asm/sclp.h> #include <asm/nmi.h> @@ -42,8 +41,11 @@ struct vsie_page { * are reused conditionally, should be accessed via READ_ONCE. */ struct kvm_s390_sie_block *scb_o; /* 0x0218 */ - /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0220 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0220 */ /* address of the last reported fault to guest2 */ unsigned long fault_addr; /* 0x0228 */ /* calculated guest addresses of satellite control blocks */ @@ -58,16 +60,15 @@ struct vsie_page { * radix tree. */ gpa_t scb_gpa; /* 0x0258 */ - /* - * Flags: must be set/cleared atomically after the vsie page can be - * looked up by other CPUs. 
- */ - unsigned long flags; /* 0x0260 */ - __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap_cache gmap_cache; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; +static_assert(sizeof(struct vsie_page) == PAGE_SIZE); + /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, __u16 reason_code) @@ -124,8 +125,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; int newflags, cpuflags = atomic_read(&scb_o->cpuflags); - /* we don't allow ESA/390 guests */ - if (!(cpuflags & CPUSTAT_ZARCH)) + /* we don't allow ESA/390 guests unless explicitly enabled */ + if (!(cpuflags & CPUSTAT_ZARCH) && !vcpu->kvm->arch.allow_vsie_esamode) return set_validity_icpt(scb_s, 0x0001U); if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) @@ -134,7 +135,9 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0007U); /* intervention requests will be set later */ - newflags = CPUSTAT_ZARCH; + newflags = 0; + if (cpuflags & CPUSTAT_ZARCH) + newflags = CPUSTAT_ZARCH; if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) newflags |= CPUSTAT_GED; if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { @@ -384,6 +387,17 @@ end: return 0; } +static void shadow_esa(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + + /* Ensure these bits are indeed turned off */ + scb_s->eca &= ~ECA_VX; + scb_s->ecb &= ~(ECB_GS | ECB_TE); + scb_s->ecb3 &= ~ECB3_RI; + scb_s->ecd &= ~ECD_HOSTREGMGMT; +} + /* shadow (round up/down) the ibc to avoid validity icpt */ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { @@ -465,7 +479,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; /* READ_ONCE does not work on bitfields - use a temporary variable */ const uint32_t __new_prefix = scb_o->prefix; - const uint32_t new_prefix = READ_ONCE(__new_prefix); + uint32_t new_prefix = READ_ONCE(__new_prefix); const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE; bool had_tx = scb_s->ecb & ECB_TE; unsigned long new_mso = 0; @@ -513,6 +527,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->icpua = scb_o->icpua; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH)) + new_prefix &= GUEST_PREFIX_MASK_ESA; + else + new_prefix &= GUEST_PREFIX_MASK_ZARCH; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL; /* if the hva of the prefix changes, we have to remap the prefix */ @@ -587,6 +606,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->hpid = HPID_VSIE; scb_s->cpnc = scb_o->cpnc; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_ZARCH)) + shadow_esa(vcpu, vsie_page); + prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); out: @@ -595,26 +617,17 @@ out: return rc; } -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end) { - struct kvm *kvm = gmap->private; - struct vsie_page *cur; + struct vsie_page *cur, *next; unsigned long prefix; - int i; - if 
(!gmap_is_shadow(gmap)) - return; + KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm); /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. */ - for (i = 0; i < kvm->arch.vsie.page_count; i++) { - cur = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!cur) - continue; - if (READ_ONCE(cur->gmap) != gmap) - continue; + list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) { prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; /* with mso/msl, the prefix lies at an offset */ prefix += cur->scb_s.mso; @@ -635,7 +648,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * - -EAGAIN if the caller can retry immediately * - -ENOMEM if out of memory */ -static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; @@ -650,10 +663,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true); if (!rc && (scb_s->ecb & ECB_TE)) - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -765,7 +777,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) rc = set_validity_icpt(scb_s, 0x0011U); else if ((gpa & PAGE_MASK) != - ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK)) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa); @@ -934,8 +946,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { + bool wr = kvm_s390_cur_gmap_fault_is_write(); int rc; if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) @@ -943,12 +956,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return inject_fault(vcpu, PGM_PROTECTION, current->thread.gmap_teid.addr * PAGE_SIZE, 1); - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_teid.addr * PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr); if (rc > 0) { rc = inject_fault(vcpu, rc, - current->thread.gmap_teid.addr * PAGE_SIZE, - kvm_s390_cur_gmap_fault_is_write()); + current->thread.gmap_teid.addr * PAGE_SIZE, wr); if (rc >= 0) vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; } @@ -961,12 +972,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Will ignore any errors. The next SIE fault will do proper fault handling. 
 */
-static void handle_last_fault(struct kvm_vcpu *vcpu,
-			       struct vsie_page *vsie_page)
+static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
 	if (vsie_page->fault_addr)
-		kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-				      vsie_page->fault_addr, NULL);
+		gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true);
 	vsie_page->fault_addr = 0;
 }
@@ -1048,11 +1057,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
 	}
 }
 
-static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-	unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+	unsigned long src, dest, mask, prefix;
 	u64 *pei_block = &vsie_page->scb_o->mcic;
+	union mvpg_pei pei_dest, pei_src;
 	int edat, rc_dest, rc_src;
 	union ctlreg0 cr0;
 
@@ -1066,8 +1076,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
 	src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
 
-	rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
-	rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+	rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true);
+	rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false);
 	/*
 	 * Either everything went well, or something non-critical went wrong
 	 * e.g. because of a race. In either case, simply retry.
@@ -1102,8 +1112,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
 	}
 	if (!rc_dest && !rc_src) {
-		pei_block[0] = pei_dest;
-		pei_block[1] = pei_src;
+		pei_block[0] = pei_dest.val;
+		pei_block[1] = pei_src.val;
 		return 1;
 	}
 
@@ -1127,16 +1137,17 @@
  * - > 0 if control has to be given to guest 2
  * - < 0 if an error occurred
  */
-static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 	__releases(vcpu->kvm->srcu)
 	__acquires(vcpu->kvm->srcu)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	unsigned long sie_return = SIE64_RETURN_NORMAL;
 	int guest_bp_isolation;
 	int rc = 0;
 
-	handle_last_fault(vcpu, vsie_page);
+	handle_last_fault(vcpu, vsie_page, sg);
 
 	kvm_vcpu_srcu_read_unlock(vcpu);
 
@@ -1153,10 +1164,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	    vcpu->arch.sie_block->fpf & FPF_BPBC)
 		set_thread_flag(TIF_ISOLATE_BP_GUEST);
 
-	local_irq_disable();
-	guest_enter_irqoff();
-	local_irq_enable();
-
 	/*
 	 * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
 	 * and VCPU requests also hinder the vSIE from running and lead
@@ -1166,31 +1173,44 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
 	current->thread.gmap_int_code = 0;
 	barrier();
-	if (!kvm_s390_vcpu_sie_inhibited(vcpu))
-		rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+	if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+xfer_to_guest_mode_check:
+		local_irq_disable();
+		xfer_to_guest_mode_prepare();
+		if (xfer_to_guest_mode_work_pending()) {
+			local_irq_enable();
+			rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+			if (rc)
+				goto skip_sie;
+			goto xfer_to_guest_mode_check;
+		}
+		guest_timing_enter_irqoff();
+		sie_return = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val);
+		guest_timing_exit_irqoff();
+		local_irq_enable();
+	}
+
+skip_sie:
 	barrier();
 	vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
 
-	local_irq_disable();
-	guest_exit_irqoff();
-	local_irq_enable();
-
 	/* restore guest state for bp isolation override */
 	if (!guest_bp_isolation)
 		clear_thread_flag(TIF_ISOLATE_BP_GUEST);
 
 	kvm_vcpu_srcu_read_lock(vcpu);
 
-	if (rc == -EINTR) {
-		VCPU_EVENT(vcpu, 3, "%s", "machine check");
+	if (sie_return == SIE64_RETURN_MCCK) {
 		kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info);
 		return 0;
 	}
 
+	WARN_ON_ONCE(sie_return != SIE64_RETURN_NORMAL);
+
 	if (rc > 0)
 		rc = 0; /* we could still have an icpt */
 	else if (current->thread.gmap_int_code)
-		return handle_fault(vcpu, vsie_page);
+		return handle_fault(vcpu, vsie_page, sg);
 
 	switch (scb_s->icptcode) {
 	case ICPT_INST:
@@ -1208,7 +1228,7 @@
 		break;
 	case ICPT_PARTEXEC:
 		if (scb_s->ipa == 0xb254)
-			rc = vsie_handle_mvpg(vcpu, vsie_page);
+			rc = vsie_handle_mvpg(vcpu, vsie_page, sg);
 		break;
 	}
 	return rc;
@@ -1216,43 +1236,67 @@
 
 static void release_gmap_shadow(struct vsie_page *vsie_page)
 {
-	if (vsie_page->gmap)
-		gmap_put(vsie_page->gmap);
-	WRITE_ONCE(vsie_page->gmap, NULL);
+	struct gmap *gmap = vsie_page->gmap_cache.gmap;
+
+	lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock);
+
+	list_del(&vsie_page->gmap_cache.list);
+	vsie_page->gmap_cache.gmap = NULL;
 	prefix_unmapped(vsie_page);
+
+	if (list_empty(&gmap->scb_users)) {
+		gmap_remove_child(gmap);
+		gmap_put(gmap);
+	}
 }
 
-static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
-			       struct vsie_page *vsie_page)
+static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
-	unsigned long asce;
 	union ctlreg0 cr0;
 	struct gmap *gmap;
+	union asce asce;
 	int edat;
 
-	asce = vcpu->arch.sie_block->gcr[1];
+	asce.val = vcpu->arch.sie_block->gcr[1];
 	cr0.val = vcpu->arch.sie_block->gcr[0];
 	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
 	edat += edat && test_kvm_facility(vcpu->kvm, 78);
 
-	/*
-	 * ASCE or EDAT could have changed since last icpt, or the gmap
-	 * we're holding has been unshadowed. If the gmap is still valid,
-	 * we can safely reuse it.
-	 */
-	if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
-		vcpu->kvm->stat.gmap_shadow_reuse++;
-		return 0;
+	scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+		gmap = vsie_page->gmap_cache.gmap;
+		if (gmap) {
+			/*
+			 * ASCE or EDAT could have changed since last icpt, or the gmap
+			 * we're holding has been unshadowed. If the gmap is still valid,
+			 * we can safely reuse it.
+			 */
+			if (gmap_is_shadow_valid(gmap, asce, edat)) {
+				vcpu->kvm->stat.gmap_shadow_reuse++;
+				gmap_get(gmap);
+				return gmap;
+			}
+			/* release the old shadow and mark the prefix as unmapped */
+			release_gmap_shadow(vsie_page);
+		}
 	}
-
-	/* release the old shadow - if any, and mark the prefix as unmapped */
-	release_gmap_shadow(vsie_page);
-	gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+again:
+	gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat);
 	if (IS_ERR(gmap))
-		return PTR_ERR(gmap);
-	vcpu->kvm->stat.gmap_shadow_create++;
-	WRITE_ONCE(vsie_page->gmap, gmap);
-	return 0;
+		return gmap;
+	scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+		/* unlikely race condition, remove the previous shadow */
+		if (vsie_page->gmap_cache.gmap)
+			release_gmap_shadow(vsie_page);
+		if (!gmap->parent) {
+			gmap_put(gmap);
+			goto again;
+		}
+		vcpu->kvm->stat.gmap_shadow_create++;
+		list_add(&vsie_page->gmap_cache.list, &gmap->scb_users);
+		vsie_page->gmap_cache.gmap = gmap;
+		prefix_unmapped(vsie_page);
+	}
+	return gmap;
 }
 
 /*
@@ -1305,15 +1349,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct gmap *sg = NULL;
 	int rc = 0;
 
 	while (1) {
-		rc = acquire_gmap_shadow(vcpu, vsie_page);
+		sg = acquire_gmap_shadow(vcpu, vsie_page);
+		if (IS_ERR(sg)) {
+			rc = PTR_ERR(sg);
+			sg = NULL;
+		}
 		if (!rc)
-			rc = map_prefix(vcpu, vsie_page);
+			rc = map_prefix(vcpu, vsie_page, sg);
 		if (!rc) {
 			update_intervention_requests(vsie_page);
-			rc = do_vsie_run(vcpu, vsie_page);
+			rc = do_vsie_run(vcpu, vsie_page, sg);
 		}
 		atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
 
@@ -1331,14 +1380,17 @@
 		 * but rewind the PSW to re-enter SIE once that's completed
 		 * instead of passing a "no action" intercept to the guest.
 		 */
-		if (signal_pending(current) ||
-		    kvm_s390_vcpu_has_irq(vcpu, 0) ||
+		if (kvm_s390_vcpu_has_irq(vcpu, 0) ||
 		    kvm_s390_vcpu_sie_inhibited(vcpu)) {
 			kvm_s390_rewind_psw(vcpu, 4);
 			break;
 		}
+		if (sg)
+			sg = gmap_put(sg);
 		cond_resched();
 	}
+	if (sg)
+		sg = gmap_put(sg);
 
 	if (rc == -EFAULT) {
 		/*
@@ -1434,8 +1486,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 	vsie_page->scb_gpa = ULONG_MAX;
 
 	/* Double use of the same address or allocation failure. */
-	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
-			      vsie_page)) {
+	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) {
 		put_vsie_page(vsie_page);
 		mutex_unlock(&kvm->arch.vsie.mutex);
 		return NULL;
@@ -1444,7 +1495,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 	mutex_unlock(&kvm->arch.vsie.mutex);
 
 	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
-	release_gmap_shadow(vsie_page);
+	if (vsie_page->gmap_cache.gmap) {
+		scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+			if (vsie_page->gmap_cache.gmap)
+				release_gmap_shadow(vsie_page);
+	}
+	prefix_unmapped(vsie_page);
 	vsie_page->fault_addr = 0;
 	vsie_page->scb_s.ihcpu = 0xffffU;
 	return vsie_page;
@@ -1469,18 +1525,19 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
 	if (unlikely(scb_addr & 0x1ffUL))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
-	    kvm_s390_vcpu_sie_inhibited(vcpu)) {
+	if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) {
 		kvm_s390_rewind_psw(vcpu, 4);
 		return 0;
 	}
 
 	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
-	if (IS_ERR(vsie_page))
+	if (IS_ERR(vsie_page)) {
 		return PTR_ERR(vsie_page);
-	else if (!vsie_page)
+	} else if (!vsie_page) {
 		/* double use of sie control block - simply do nothing */
+		kvm_s390_rewind_psw(vcpu, 4);
 		return 0;
+	}
 
 	rc = pin_scb(vcpu, vsie_page, scb_addr);
 	if (rc)
@@ -1521,8 +1578,10 @@
 	mutex_lock(&kvm->arch.vsie.mutex);
 	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
 		vsie_page = kvm->arch.vsie.pages[i];
+		scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+			if (vsie_page->gmap_cache.gmap)
+				release_gmap_shadow(vsie_page);
 		kvm->arch.vsie.pages[i] = NULL;
-		release_gmap_shadow(vsie_page);
		/* free the radix tree entry */
 		if (vsie_page->scb_gpa != ULONG_MAX)
 			radix_tree_delete(&kvm->arch.vsie.addr_to_page,
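
Note: the do_vsie_run() hunks above replace the open-coded guest_enter/guest_exit bracketing around sie64a() with the generic xfer-to-guest-mode loop: with interrupts disabled, pending work is checked; if any is found, interrupts are re-enabled, the work is handled, and the check restarts, so SIE is only ever entered with no work left. The following is a minimal standalone sketch of that control flow, not kernel code: irq_disable()/irq_enable(), work_pending(), handle_work() and enter_sie() are invented stubs standing in for local_irq_disable()/local_irq_enable(), xfer_to_guest_mode_work_pending(), kvm_xfer_to_guest_mode_handle_work() and kvm_s390_enter_exit_sie().

#include <stdbool.h>
#include <stdio.h>

enum sie_result { SIE_RETURN_NORMAL, SIE_RETURN_MCCK };

static int pending_work = 2;	/* pretend two work items are queued */

static void irq_disable(void) { printf("irqs off\n"); }
static void irq_enable(void)  { printf("irqs on\n"); }

static bool work_pending(void) { return pending_work > 0; }

/* models the work handler: 0 means retry the entry, nonzero means bail out */
static int handle_work(void)
{
	printf("handling deferred work item %d\n", pending_work--);
	return 0;
}

static enum sie_result enter_sie(void)
{
	printf("entering SIE\n");
	return SIE_RETURN_NORMAL;
}

int main(void)
{
	enum sie_result sie_return = SIE_RETURN_NORMAL;
	int rc = 0;

check:
	irq_disable();
	if (work_pending()) {
		/* work must run with interrupts on; re-check from scratch after */
		irq_enable();
		rc = handle_work();
		if (rc)
			goto skip_sie;
		goto check;
	}
	sie_return = enter_sie();	/* reached only with no work pending */
	irq_enable();

skip_sie:
	printf("rc=%d sie_return=%d\n", rc, sie_return);
	return 0;
}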
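
Note: acquire_gmap_shadow() above now hands out a reference-counted shadow gmap. Under children_lock it first tries to reuse the cached shadow if its ASCE/EDAT configuration still matches, otherwise it drops the stale cache entry, creates a new shadow outside the lock, and publishes it under the lock again. A rough standalone sketch of that reuse-or-create pattern follows; it is not kernel code: a pthread mutex stands in for children_lock, a plain counter for the gmap refcount, and the unshadow-race retry via gmap->parent plus the scb_users list are omitted.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct shadow {
	int refs;
	int asce, edat;	/* identity of the shadowed configuration */
	int valid;	/* would be cleared when the shadow is unshadowed */
};

static pthread_mutex_t children_lock = PTHREAD_MUTEX_INITIALIZER;
static struct shadow *cache;	/* models vsie_page->gmap_cache.gmap */

static struct shadow *shadow_get(struct shadow *s) { s->refs++; return s; }

static void shadow_put(struct shadow *s)
{
	if (--s->refs == 0)
		free(s);
}

static struct shadow *create_shadow(int asce, int edat)
{
	struct shadow *s = calloc(1, sizeof(*s));

	s->refs = 1; s->asce = asce; s->edat = edat; s->valid = 1;
	return s;
}

static struct shadow *acquire_shadow(int asce, int edat)
{
	struct shadow *s;

	pthread_mutex_lock(&children_lock);
	s = cache;
	if (s && s->valid && s->asce == asce && s->edat == edat) {
		/* still matches the guest's ASCE/EDAT: reuse it */
		s = shadow_get(s);
		pthread_mutex_unlock(&children_lock);
		return s;
	}
	if (s) {	/* stale: drop the cached reference */
		cache = NULL;
		shadow_put(s);
	}
	pthread_mutex_unlock(&children_lock);

	s = create_shadow(asce, edat);	/* built outside the lock ... */
	pthread_mutex_lock(&children_lock);
	cache = shadow_get(s);		/* ... then published under it */
	pthread_mutex_unlock(&children_lock);
	return s;
}

int main(void)
{
	struct shadow *a = acquire_shadow(1, 2);
	struct shadow *b = acquire_shadow(1, 2);	/* reused, not recreated */

	printf("reused: %s\n", a == b ? "yes" : "no");
	shadow_put(b);
	shadow_put(a);
	shadow_put(cache);	/* drop the cache's own reference too */
	return 0;
}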
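
Note: get_vsie_page()/kvm_s390_handle_vsie() above treat a colliding insert into the addr_to_page radix tree as "double use of the same address": the caller gets NULL, rewinds the PSW by 4 and retries the instruction later instead of reporting an error. A toy sketch of that rule, not kernel code: a fixed-size array stands in for the radix tree, and cache_insert() plus the key handling are invented for illustration (only the addr >> 9 keying is taken from the diff).

#include <stdio.h>
#include <stdlib.h>

#define SLOTS 16

struct vsie_page { unsigned long key; };

static struct vsie_page *slots[SLOTS];

/* models radix_tree_insert(): fails when the key is already present */
static int cache_insert(unsigned long key, struct vsie_page *p)
{
	unsigned long idx = key % SLOTS;

	if (slots[idx])
		return -1;	/* double use of the same address */
	p->key = key;
	slots[idx] = p;
	return 0;
}

static struct vsie_page *get_vsie_page(unsigned long addr)
{
	struct vsie_page *p = malloc(sizeof(*p));

	if (!p)
		return NULL;
	/* key by origin, dropping the low bits (addr >> 9 in the diff) */
	if (cache_insert(addr >> 9, p)) {
		free(p);
		return NULL;	/* caller rewinds the PSW and does nothing */
	}
	return p;
}

int main(void)
{
	struct vsie_page *a = get_vsie_page(0x1200);
	struct vsie_page *b = get_vsie_page(0x1200);	/* same block: NULL */

	printf("first: %p, second: %s\n",
	       (void *)a, b ? "page" : "NULL (rewind PSW)");
	return 0;
}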
