From 75ea44e356b5de8c817f821c9dd68ae329e82add Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 25 May 2021 18:07:58 +0200
Subject: perf jevents: Fix getting maximum number of fds

On some hosts, rlim.rlim_max can be returned as RLIM_INFINITY.
By casting it to int, it is interpreted as -1, which will cause get_maxfds
to return 0, causing "Invalid argument" errors in nftw() calls.
Fix this by casting the second argument of min() to rlim_t instead.

Fixes: 80eeb67fe577 ("perf jevents: Program to convert JSON file")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Link: http://lore.kernel.org/lkml/20210525160758.97829-1-nbd@nbd.name
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/jevents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index 7422b0ea8790..9604446f8360 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -960,7 +960,7 @@ static int get_maxfds(void)
 	struct rlimit rlim;
 
 	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0)
-		return min((int)rlim.rlim_max / 2, 512);
+		return min(rlim.rlim_max / 2, (rlim_t)512);
 
 	return 512;
 }
-- 
cgit v1.2.3


From 39fe2fc96694164723846fccf6caa42c3aee6ec4 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Wed, 12 May 2021 12:31:06 +0800
Subject: selftests: kvm: make allocation of extra memory take effect

The extra memory pages is missed to be allocated during VM creating.
perf_test_util and kvm_page_table_test use it to alloc extra memory
currently.

Fix it by adding extra_mem_pages to the total memory calculation before
allocate.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20210512043107.30076-1-zhenzhong.duan@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index fc83f6c5902d..159f4d62241d 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -295,7 +295,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
 	 */
 	uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
 	uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
-	uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages;
+	uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages;
 	struct kvm_vm *vm;
 	int i;
 
-- 
cgit v1.2.3


From a13534d6676d2f2a9aa286e27e482b4896ff90e3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 24 May 2021 14:27:38 +0200
Subject: selftests: kvm: fix potential issue with ELF loading

vm_vaddr_alloc() sets up GVA to GPA mapping page by page; therefore, GPAs
may not be continuous if same memslot is used for data and page table allocation.

kvm_vm_elf_load() however expects a continuous range of HVAs (and thus GPAs)
because it does not try to read file data page by page.  Fix this mismatch
by allocating memory in one step.

Reported-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 159f4d62241d..12d953d8ee35 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1099,6 +1099,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
 	uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
 
 	virt_pgd_alloc(vm, pgd_memslot);
+	vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
+					      KVM_UTIL_MIN_PFN * vm->page_size,
+					      data_memslot);
 
 	/*
 	 * Find an unused range of virtual page addresses of at least
@@ -1108,11 +1111,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
 
 	/* Map the virtual pages. */
 	for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
-		pages--, vaddr += vm->page_size) {
-		vm_paddr_t paddr;
-
-		paddr = vm_phy_page_alloc(vm,
-				KVM_UTIL_MIN_PFN * vm->page_size, data_memslot);
+		pages--, vaddr += vm->page_size, paddr += vm->page_size) {
 
 		virt_pg_map(vm, vaddr, paddr, pgd_memslot);
 
-- 
cgit v1.2.3


From 22721a56109940f15b673d0f01907b7a7202275e Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Tue, 13 Apr 2021 16:08:27 +0200
Subject: KVM: selftests: Keep track of memslots more efficiently

The KVM selftest framework was using a simple list for keeping track of
the memslots currently in use.
This resulted in lookups and adding a single memslot being O(n), the
later due to linear scanning of the existing memslot set to check for
the presence of any conflicting entries.

Before this change, benchmarking high count of memslots was more or less
impossible as pretty much all the benchmark time was spent in the
selftest framework code.

We can simply use a rbtree for keeping track of both of gfn and hva.
We don't need an interval tree for hva here as we can't have overlapping
memslots because we allocate a completely new memory chunk for each new
memslot.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-Id: <b12749d47ee860468240cf027412c91b76dbe3db.1618253574.git.maciej.szmigiero@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile               |   2 +-
 tools/testing/selftests/kvm/lib/kvm_util.c         | 141 ++++++++++++++++-----
 .../testing/selftests/kvm/lib/kvm_util_internal.h  |  15 ++-
 tools/testing/selftests/kvm/lib/rbtree.c           |   1 +
 4 files changed, 124 insertions(+), 35 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/lib/rbtree.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index e439d027939d..a8c30f888d40 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -33,7 +33,7 @@ ifeq ($(ARCH),s390)
 	UNAME_M := s390x
 endif
 
-LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c
 LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
 LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 12d953d8ee35..1255744758e3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -203,7 +203,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 	TEST_ASSERT(vm != NULL, "Insufficient Memory");
 
 	INIT_LIST_HEAD(&vm->vcpus);
-	INIT_LIST_HEAD(&vm->userspace_mem_regions);
+	vm->regions.gpa_tree = RB_ROOT;
+	vm->regions.hva_tree = RB_ROOT;
+	hash_init(vm->regions.slot_hash);
 
 	vm->mode = mode;
 	vm->type = 0;
@@ -355,13 +357,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
  */
 void kvm_vm_restart(struct kvm_vm *vmp, int perm)
 {
+	int ctr;
 	struct userspace_mem_region *region;
 
 	vm_open(vmp, perm);
 	if (vmp->has_irqchip)
 		vm_create_irqchip(vmp);
 
-	list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
+	hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
 		int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
 		TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
 			    "  rc: %i errno: %i\n"
@@ -424,14 +427,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
 static struct userspace_mem_region *
 userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
 {
-	struct userspace_mem_region *region;
+	struct rb_node *node;
 
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+	for (node = vm->regions.gpa_tree.rb_node; node; ) {
+		struct userspace_mem_region *region =
+			container_of(node, struct userspace_mem_region, gpa_node);
 		uint64_t existing_start = region->region.guest_phys_addr;
 		uint64_t existing_end = region->region.guest_phys_addr
 			+ region->region.memory_size - 1;
 		if (start <= existing_end && end >= existing_start)
 			return region;
+
+		if (start < existing_start)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
 	}
 
 	return NULL;
@@ -546,11 +556,16 @@ void kvm_vm_release(struct kvm_vm *vmp)
 }
 
 static void __vm_mem_region_delete(struct kvm_vm *vm,
-				   struct userspace_mem_region *region)
+				   struct userspace_mem_region *region,
+				   bool unlink)
 {
 	int ret;
 
-	list_del(&region->list);
+	if (unlink) {
+		rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
+		rb_erase(&region->hva_node, &vm->regions.hva_tree);
+		hash_del(&region->slot_node);
+	}
 
 	region->region.memory_size = 0;
 	ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
@@ -569,14 +584,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
  */
 void kvm_vm_free(struct kvm_vm *vmp)
 {
-	struct userspace_mem_region *region, *tmp;
+	int ctr;
+	struct hlist_node *node;
+	struct userspace_mem_region *region;
 
 	if (vmp == NULL)
 		return;
 
 	/* Free userspace_mem_regions. */
-	list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
-		__vm_mem_region_delete(vmp, region);
+	hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
+		__vm_mem_region_delete(vmp, region, false);
 
 	/* Free sparsebit arrays. */
 	sparsebit_free(&vmp->vpages_valid);
@@ -658,6 +675,57 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
 	return 0;
 }
 
+static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
+					       struct userspace_mem_region *region)
+{
+	struct rb_node **cur, *parent;
+
+	for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
+		struct userspace_mem_region *cregion;
+
+		cregion = container_of(*cur, typeof(*cregion), gpa_node);
+		parent = *cur;
+		if (region->region.guest_phys_addr <
+		    cregion->region.guest_phys_addr)
+			cur = &(*cur)->rb_left;
+		else {
+			TEST_ASSERT(region->region.guest_phys_addr !=
+				    cregion->region.guest_phys_addr,
+				    "Duplicate GPA in region tree");
+
+			cur = &(*cur)->rb_right;
+		}
+	}
+
+	rb_link_node(&region->gpa_node, parent, cur);
+	rb_insert_color(&region->gpa_node, gpa_tree);
+}
+
+static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
+					       struct userspace_mem_region *region)
+{
+	struct rb_node **cur, *parent;
+
+	for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
+		struct userspace_mem_region *cregion;
+
+		cregion = container_of(*cur, typeof(*cregion), hva_node);
+		parent = *cur;
+		if (region->host_mem < cregion->host_mem)
+			cur = &(*cur)->rb_left;
+		else {
+			TEST_ASSERT(region->host_mem !=
+				    cregion->host_mem,
+				    "Duplicate HVA in region tree");
+
+			cur = &(*cur)->rb_right;
+		}
+	}
+
+	rb_link_node(&region->hva_node, parent, cur);
+	rb_insert_color(&region->hva_node, hva_tree);
+}
+
 /*
  * VM Userspace Memory Region Add
  *
@@ -722,7 +790,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 			(uint64_t) region->region.memory_size);
 
 	/* Confirm no region with the requested slot already exists. */
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
+			       slot) {
 		if (region->region.slot != slot)
 			continue;
 
@@ -793,8 +862,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 		ret, errno, slot, flags,
 		guest_paddr, (uint64_t) region->region.memory_size);
 
-	/* Add to linked-list of memory regions. */
-	list_add(&region->list, &vm->userspace_mem_regions);
+	/* Add to quick lookup data structures */
+	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
+	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
+	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
 }
 
 /*
@@ -817,10 +888,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot)
 {
 	struct userspace_mem_region *region;
 
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
+			       memslot)
 		if (region->region.slot == memslot)
 			return region;
-	}
 
 	fprintf(stderr, "No mem region with the requested slot found,\n"
 		"  requested slot: %u\n", memslot);
@@ -905,7 +976,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
  */
 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
 {
-	__vm_mem_region_delete(vm, memslot2region(vm, slot));
+	__vm_mem_region_delete(vm, memslot2region(vm, slot), true);
 }
 
 /*
@@ -1176,16 +1247,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
 {
 	struct userspace_mem_region *region;
 
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
-		if ((gpa >= region->region.guest_phys_addr)
-			&& (gpa <= (region->region.guest_phys_addr
-				+ region->region.memory_size - 1)))
-			return (void *) ((uintptr_t) region->host_mem
-				+ (gpa - region->region.guest_phys_addr));
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	if (!region) {
+		TEST_FAIL("No vm physical memory at 0x%lx", gpa);
+		return NULL;
 	}
 
-	TEST_FAIL("No vm physical memory at 0x%lx", gpa);
-	return NULL;
+	return (void *)((uintptr_t)region->host_mem
+		+ (gpa - region->region.guest_phys_addr));
 }
 
 /*
@@ -1207,15 +1276,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
  */
 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
 {
-	struct userspace_mem_region *region;
+	struct rb_node *node;
+
+	for (node = vm->regions.hva_tree.rb_node; node; ) {
+		struct userspace_mem_region *region =
+			container_of(node, struct userspace_mem_region, hva_node);
+
+		if (hva >= region->host_mem) {
+			if (hva <= (region->host_mem
+				+ region->region.memory_size - 1))
+				return (vm_paddr_t)((uintptr_t)
+					region->region.guest_phys_addr
+					+ (hva - (uintptr_t)region->host_mem));
 
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
-		if ((hva >= region->host_mem)
-			&& (hva <= (region->host_mem
-				+ region->region.memory_size - 1)))
-			return (vm_paddr_t) ((uintptr_t)
-				region->region.guest_phys_addr
-				+ (hva - (uintptr_t) region->host_mem));
+			node = node->rb_right;
+		} else
+			node = node->rb_left;
 	}
 
 	TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
@@ -1821,6 +1897,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
  */
 void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
+	int ctr;
 	struct userspace_mem_region *region;
 	struct vcpu *vcpu;
 
@@ -1828,7 +1905,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
 	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
 	fprintf(stream, "%*sMem Regions:\n", indent, "");
-	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
 		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
 			"host_virt: %p\n", indent + 2, "",
 			(uint64_t) region->region.guest_phys_addr,
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index 91ce1b5d480b..b30e8c7b119b 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -8,6 +8,9 @@
 #ifndef SELFTEST_KVM_UTIL_INTERNAL_H
 #define SELFTEST_KVM_UTIL_INTERNAL_H
 
+#include "linux/hashtable.h"
+#include "linux/rbtree.h"
+
 #include "sparsebit.h"
 
 struct userspace_mem_region {
@@ -18,7 +21,9 @@ struct userspace_mem_region {
 	void *host_mem;
 	void *mmap_start;
 	size_t mmap_size;
-	struct list_head list;
+	struct rb_node gpa_node;
+	struct rb_node hva_node;
+	struct hlist_node slot_node;
 };
 
 struct vcpu {
@@ -31,6 +36,12 @@ struct vcpu {
 	uint32_t dirty_gfns_count;
 };
 
+struct userspace_mem_regions {
+	struct rb_root gpa_tree;
+	struct rb_root hva_tree;
+	DECLARE_HASHTABLE(slot_hash, 9);
+};
+
 struct kvm_vm {
 	int mode;
 	unsigned long type;
@@ -43,7 +54,7 @@ struct kvm_vm {
 	unsigned int va_bits;
 	uint64_t max_gfn;
 	struct list_head vcpus;
-	struct list_head userspace_mem_regions;
+	struct userspace_mem_regions regions;
 	struct sparsebit *vpages_valid;
 	struct sparsebit *vpages_mapped;
 	bool has_irqchip;
diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c
new file mode 100644
index 000000000000..a703f0194ea3
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/rbtree.c
@@ -0,0 +1 @@
+#include "../../../../lib/rbtree.c"
-- 
cgit v1.2.3


From cad347fab142bcb9bebc125b5ba0c1e52ce74fdc Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Tue, 13 Apr 2021 16:08:28 +0200
Subject: KVM: selftests: add a memslot-related performance benchmark

This benchmark contains the following tests:
* Map test, where the host unmaps guest memory while the guest writes to
it (maps it).

The test is designed in a way to make the unmap operation on the host
take a negligible amount of time in comparison with the mapping
operation in the guest.

The test area is actually split in two: the first half is being mapped
by the guest while the second half in being unmapped by the host.
Then a guest <-> host sync happens and the areas are reversed.

* Unmap test which is broadly similar to the above map test, but it is
designed in an opposite way: to make the mapping operation in the guest
take a negligible amount of time in comparison with the unmap operation
on the host.
This test is available in two variants: with per-page unmap operation
or a chunked one (using 2 MiB chunk size).

* Move active area test which involves moving the last (highest gfn)
memslot a bit back and forth on the host while the guest is
concurrently writing around the area being moved (including over the
moved memslot).

* Move inactive area test which is similar to the previous move active
area test, but now guest writes all happen outside of the area being
moved.

* Read / write test in which the guest writes to the beginning of each
page of the test area while the host writes to the middle of each such
page.
Then each side checks the values the other side has written.
This particular test is not expected to give different results depending
on particular memslots implementation, it is meant as a rough sanity
check and to provide insight on the spread of test results expected.

Each test performs its operation in a loop until a test period ends
(this is 5 seconds by default, but it is configurable).
Then the total count of loops done is divided by the actual elapsed
time to give the test result.

The tests have a configurable memslot cap with the "-s" test option, by
default the system maximum is used.
Each test is repeated a particular number of times (by default 20
times), the best result achieved is printed.

The test memory area is divided equally between memslots, the reminder
is added to the last memslot.
The test area size does not depend on the number of memslots in use.

The tests also measure the time that it took to add all these memslots.
The best result from the tests that use the whole test area is printed
after all the requested tests are done.

In general, these tests are designed to use as much memory as possible
(within reason) while still doing 100+ loops even on high memslot counts
with the default test length.
Increasing the test runtime makes it increasingly more likely that some
event will happen on the system during the test run, which might lower
the test result.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Message-Id: <8d31bb3d92bc8fa33a9756fa802ee14266ab994e.1618253574.git.maciej.szmigiero@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/.gitignore          |    1 +
 tools/testing/selftests/kvm/Makefile            |    1 +
 tools/testing/selftests/kvm/memslot_perf_test.c | 1037 +++++++++++++++++++++++
 3 files changed, 1039 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/memslot_perf_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index bd83158e0e0b..524c857a049c 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -41,5 +41,6 @@
 /kvm_create_max_vcpus
 /kvm_page_table_test
 /memslot_modification_stress_test
+/memslot_perf_test
 /set_memory_region_test
 /steal_time
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index a8c30f888d40..daaee1888b12 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -74,6 +74,7 @@ TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
+TEST_GEN_PROGS_x86_64 += memslot_perf_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
new file mode 100644
index 000000000000..4ae0e5ec0f74
--- /dev/null
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -0,0 +1,1037 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A memslot-related performance benchmark.
+ *
+ * Copyright (C) 2021 Oracle and/or its affiliates.
+ *
+ * Basic guest setup / host vCPU thread code lifted from set_memory_region_test.
+ */
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+#define MEM_SIZE		((512U << 20) + 4096)
+#define MEM_SIZE_PAGES		(MEM_SIZE / 4096)
+#define MEM_GPA		0x10000000UL
+#define MEM_AUX_GPA		MEM_GPA
+#define MEM_SYNC_GPA		MEM_AUX_GPA
+#define MEM_TEST_GPA		(MEM_AUX_GPA + 4096)
+#define MEM_TEST_SIZE		(MEM_SIZE - 4096)
+static_assert(MEM_SIZE % 4096 == 0, "invalid mem size");
+static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size");
+
+/*
+ * 32 MiB is max size that gets well over 100 iterations on 509 slots.
+ * Considering that each slot needs to have at least one page up to
+ * 8194 slots in use can then be tested (although with slightly
+ * limited resolution).
+ */
+#define MEM_SIZE_MAP		((32U << 20) + 4096)
+#define MEM_SIZE_MAP_PAGES	(MEM_SIZE_MAP / 4096)
+#define MEM_TEST_MAP_SIZE	(MEM_SIZE_MAP - 4096)
+#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096)
+static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size");
+static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size");
+
+/*
+ * 128 MiB is min size that fills 32k slots with at least one page in each
+ * while at the same time gets 100+ iterations in such test
+ */
+#define MEM_TEST_UNMAP_SIZE		(128U << 20)
+#define MEM_TEST_UNMAP_SIZE_PAGES	(MEM_TEST_UNMAP_SIZE / 4096)
+/* 2 MiB chunk size like a typical huge page */
+#define MEM_TEST_UNMAP_CHUNK_PAGES	(2U << (20 - 12))
+static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE,
+	      "invalid unmap test region size");
+static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0,
+	      "invalid unmap test region size");
+static_assert(MEM_TEST_UNMAP_SIZE_PAGES %
+	      (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0,
+	      "invalid unmap test region size");
+
+/*
+ * For the move active test the middle of the test area is placed on
+ * a memslot boundary: half lies in the memslot being moved, half in
+ * other memslot(s).
+ *
+ * When running this test with 32k memslots (32764, really) each memslot
+ * contains 4 pages.
+ * The last one additionally contains the remaining 21 pages of memory,
+ * for the total size of 25 pages.
+ * Hence, the maximum size here is 50 pages.
+ */
+#define MEM_TEST_MOVE_SIZE_PAGES	(50)
+#define MEM_TEST_MOVE_SIZE		(MEM_TEST_MOVE_SIZE_PAGES * 4096)
+#define MEM_TEST_MOVE_GPA_DEST		(MEM_GPA + MEM_SIZE)
+static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE,
+	      "invalid move test region size");
+
+#define MEM_TEST_VAL_1 0x1122334455667788
+#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00
+
+struct vm_data {
+	struct kvm_vm *vm;
+	pthread_t vcpu_thread;
+	uint32_t nslots;
+	uint64_t npages;
+	uint64_t pages_per_slot;
+	void **hva_slots;
+	bool mmio_ok;
+	uint64_t mmio_gpa_min;
+	uint64_t mmio_gpa_max;
+};
+
+struct sync_area {
+	atomic_bool start_flag;
+	atomic_bool exit_flag;
+	atomic_bool sync_flag;
+	void *move_area_ptr;
+};
+
+/*
+ * Technically, we need also for the atomic bool to be address-free, which
+ * is recommended, but not strictly required, by C11 for lockless
+ * implementations.
+ * However, in practice both GCC and Clang fulfill this requirement on
+ * all KVM-supported platforms.
+ */
+static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
+
+static sem_t vcpu_ready;
+
+static bool map_unmap_verify;
+
+static bool verbose;
+#define pr_info_v(...)				\
+	do {					\
+		if (verbose)			\
+			pr_info(__VA_ARGS__);	\
+	} while (0)
+
+static void *vcpu_worker(void *data)
+{
+	struct vm_data *vm = data;
+	struct kvm_run *run;
+	struct ucall uc;
+	uint64_t cmd;
+
+	run = vcpu_state(vm->vm, VCPU_ID);
+	while (1) {
+		vcpu_run(vm->vm, VCPU_ID);
+
+		if (run->exit_reason == KVM_EXIT_IO) {
+			cmd = get_ucall(vm->vm, VCPU_ID, &uc);
+			if (cmd != UCALL_SYNC)
+				break;
+
+			sem_post(&vcpu_ready);
+			continue;
+		}
+
+		if (run->exit_reason != KVM_EXIT_MMIO)
+			break;
+
+		TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit");
+		TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read");
+		TEST_ASSERT(run->mmio.len == 8,
+			    "Unexpected exit mmio size = %u", run->mmio.len);
+		TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min &&
+			    run->mmio.phys_addr <= vm->mmio_gpa_max,
+			    "Unexpected exit mmio address = 0x%llx",
+			    run->mmio.phys_addr);
+	}
+
+	if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+		TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+			  __FILE__, uc.args[1], uc.args[2]);
+
+	return NULL;
+}
+
+static void wait_for_vcpu(void)
+{
+	struct timespec ts;
+
+	TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+		    "clock_gettime() failed: %d\n", errno);
+
+	ts.tv_sec += 2;
+	TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+		    "sem_timedwait() failed: %d\n", errno);
+}
+
+static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages)
+{
+	uint64_t gpage, pgoffs;
+	uint32_t slot, slotoffs;
+	void *base;
+
+	TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate");
+	TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096,
+		    "Too high gpa to translate");
+	gpa -= MEM_GPA;
+
+	gpage = gpa / 4096;
+	pgoffs = gpa % 4096;
+	slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1);
+	slotoffs = gpage - (slot * data->pages_per_slot);
+
+	if (rempages) {
+		uint64_t slotpages;
+
+		if (slot == data->nslots - 1)
+			slotpages = data->npages - slot * data->pages_per_slot;
+		else
+			slotpages = data->pages_per_slot;
+
+		TEST_ASSERT(!pgoffs,
+			    "Asking for remaining pages in slot but gpa not page aligned");
+		*rempages = slotpages - slotoffs;
+	}
+
+	base = data->hva_slots[slot];
+	return (uint8_t *)base + slotoffs * 4096 + pgoffs;
+}
+
+static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot)
+{
+	TEST_ASSERT(slot < data->nslots, "Too high slot number");
+
+	return MEM_GPA + slot * data->pages_per_slot * 4096;
+}
+
+static struct vm_data *alloc_vm(void)
+{
+	struct vm_data *data;
+
+	data = malloc(sizeof(*data));
+	TEST_ASSERT(data, "malloc(vmdata) failed");
+
+	data->vm = NULL;
+	data->hva_slots = NULL;
+
+	return data;
+}
+
+static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
+		       void *guest_code, uint64_t mempages,
+		       struct timespec *slot_runtime)
+{
+	uint32_t max_mem_slots;
+	uint64_t rempages;
+	uint64_t guest_addr;
+	uint32_t slot;
+	struct timespec tstart;
+	struct sync_area *sync;
+
+	max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+	TEST_ASSERT(max_mem_slots > 1,
+		    "KVM_CAP_NR_MEMSLOTS should be greater than 1");
+	TEST_ASSERT(nslots > 1 || nslots == -1,
+		    "Slot count cap should be greater than 1");
+	if (nslots != -1)
+		max_mem_slots = min(max_mem_slots, (uint32_t)nslots);
+	pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots);
+
+	TEST_ASSERT(mempages > 1,
+		    "Can't test without any memory");
+
+	data->npages = mempages;
+	data->nslots = max_mem_slots - 1;
+	data->pages_per_slot = mempages / data->nslots;
+	if (!data->pages_per_slot) {
+		*maxslots = mempages + 1;
+		return false;
+	}
+
+	rempages = mempages % data->nslots;
+	data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
+	TEST_ASSERT(data->hva_slots, "malloc() fail");
+
+	data->vm = vm_create_default(VCPU_ID, mempages, guest_code);
+
+	pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
+		max_mem_slots - 1, data->pages_per_slot, rempages);
+
+	clock_gettime(CLOCK_MONOTONIC, &tstart);
+	for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) {
+		uint64_t npages;
+
+		npages = data->pages_per_slot;
+		if (slot == max_mem_slots - 1)
+			npages += rempages;
+
+		vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS,
+					    guest_addr, slot, npages,
+					    0);
+		guest_addr += npages * 4096;
+	}
+	*slot_runtime = timespec_elapsed(tstart);
+
+	for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) {
+		uint64_t npages;
+		uint64_t gpa;
+
+		npages = data->pages_per_slot;
+		if (slot == max_mem_slots - 2)
+			npages += rempages;
+
+		gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr,
+					 slot + 1);
+		TEST_ASSERT(gpa == guest_addr,
+			    "vm_phy_pages_alloc() failed\n");
+
+		data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr);
+		memset(data->hva_slots[slot], 0, npages * 4096);
+
+		guest_addr += npages * 4096;
+	}
+
+	virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0);
+
+	sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+	atomic_init(&sync->start_flag, false);
+	atomic_init(&sync->exit_flag, false);
+	atomic_init(&sync->sync_flag, false);
+
+	data->mmio_ok = false;
+
+	return true;
+}
+
+static void launch_vm(struct vm_data *data)
+{
+	pr_info_v("Launching the test VM\n");
+
+	pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data);
+
+	/* Ensure the guest thread is spun up. */
+	wait_for_vcpu();
+}
+
+static void free_vm(struct vm_data *data)
+{
+	kvm_vm_free(data->vm);
+	free(data->hva_slots);
+	free(data);
+}
+
+static void wait_guest_exit(struct vm_data *data)
+{
+	pthread_join(data->vcpu_thread, NULL);
+}
+
+static void let_guest_run(struct sync_area *sync)
+{
+	atomic_store_explicit(&sync->start_flag, true, memory_order_release);
+}
+
+static void guest_spin_until_start(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+	while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire))
+		;
+}
+
+static void make_guest_exit(struct sync_area *sync)
+{
+	atomic_store_explicit(&sync->exit_flag, true, memory_order_release);
+}
+
+static bool _guest_should_exit(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+	return atomic_load_explicit(&sync->exit_flag, memory_order_acquire);
+}
+
+#define guest_should_exit() unlikely(_guest_should_exit())
+
+/*
+ * noinline so we can easily see how much time the host spends waiting
+ * for the guest.
+ * For the same reason use alarm() instead of polling clock_gettime()
+ * to implement a wait timeout.
+ */
+static noinline void host_perform_sync(struct sync_area *sync)
+{
+	alarm(2);
+
+	atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
+	while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))
+		;
+
+	alarm(0);
+}
+
+static bool guest_perform_sync(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+	bool expected;
+
+	do {
+		if (guest_should_exit())
+			return false;
+
+		expected = true;
+	} while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag,
+							&expected, false,
+							memory_order_acq_rel,
+							memory_order_relaxed));
+
+	return true;
+}
+
+static void guest_code_test_memslot_move(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+	uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr);
+
+	GUEST_SYNC(0);
+
+	guest_spin_until_start();
+
+	while (!guest_should_exit()) {
+		uintptr_t ptr;
+
+		for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE;
+		     ptr += 4096)
+			*(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+		/*
+		 * No host sync here since the MMIO exits are so expensive
+		 * that the host would spend most of its time waiting for
+		 * the guest and so instead of measuring memslot move
+		 * performance we would measure the performance and
+		 * likelihood of MMIO exits
+		 */
+	}
+
+	GUEST_DONE();
+}
+
+static void guest_code_test_memslot_map(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+	GUEST_SYNC(0);
+
+	guest_spin_until_start();
+
+	while (1) {
+		uintptr_t ptr;
+
+		for (ptr = MEM_TEST_GPA;
+		     ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096)
+			*(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+		if (!guest_perform_sync())
+			break;
+
+		for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2;
+		     ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096)
+			*(uint64_t *)ptr = MEM_TEST_VAL_2;
+
+		if (!guest_perform_sync())
+			break;
+	}
+
+	GUEST_DONE();
+}
+
+static void guest_code_test_memslot_unmap(void)
+{
+	struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA;
+
+	GUEST_SYNC(0);
+
+	guest_spin_until_start();
+
+	while (1) {
+		uintptr_t ptr = MEM_TEST_GPA;
+
+		/*
+		 * We can afford to access (map) just a small number of pages
+		 * per host sync as otherwise the host will spend
+		 * a significant amount of its time waiting for the guest
+		 * (instead of doing unmap operations), so this will
+		 * effectively turn this test into a map performance test.
+		 *
+		 * Just access a single page to be on the safe side.
+		 */
+		*(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+		if (!guest_perform_sync())
+			break;
+
+		ptr += MEM_TEST_UNMAP_SIZE / 2;
+		*(uint64_t *)ptr = MEM_TEST_VAL_2;
+
+		if (!guest_perform_sync())
+			break;
+	}
+
+	GUEST_DONE();
+}
+
+static void guest_code_test_memslot_rw(void)
+{
+	GUEST_SYNC(0);
+
+	guest_spin_until_start();
+
+	while (1) {
+		uintptr_t ptr;
+
+		for (ptr = MEM_TEST_GPA;
+		     ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096)
+			*(uint64_t *)ptr = MEM_TEST_VAL_1;
+
+		if (!guest_perform_sync())
+			break;
+
+		for (ptr = MEM_TEST_GPA + 4096 / 2;
+		     ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) {
+			uint64_t val = *(uint64_t *)ptr;
+
+			GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val);
+			*(uint64_t *)ptr = 0;
+		}
+
+		if (!guest_perform_sync())
+			break;
+	}
+
+	GUEST_DONE();
+}
+
+static bool test_memslot_move_prepare(struct vm_data *data,
+				      struct sync_area *sync,
+				      uint64_t *maxslots, bool isactive)
+{
+	uint64_t movesrcgpa, movetestgpa;
+
+	movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
+
+	if (isactive) {
+		uint64_t lastpages;
+
+		vm_gpa2hva(data, movesrcgpa, &lastpages);
+		if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) {
+			*maxslots = 0;
+			return false;
+		}
+	}
+
+	movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1));
+	sync->move_area_ptr = (void *)movetestgpa;
+
+	if (isactive) {
+		data->mmio_ok = true;
+		data->mmio_gpa_min = movesrcgpa;
+		data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1;
+	}
+
+	return true;
+}
+
+static bool test_memslot_move_prepare_active(struct vm_data *data,
+					     struct sync_area *sync,
+					     uint64_t *maxslots)
+{
+	return test_memslot_move_prepare(data, sync, maxslots, true);
+}
+
+static bool test_memslot_move_prepare_inactive(struct vm_data *data,
+					       struct sync_area *sync,
+					       uint64_t *maxslots)
+{
+	return test_memslot_move_prepare(data, sync, maxslots, false);
+}
+
+static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync)
+{
+	uint64_t movesrcgpa;
+
+	movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
+	vm_mem_region_move(data->vm, data->nslots - 1 + 1,
+			   MEM_TEST_MOVE_GPA_DEST);
+	vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa);
+}
+
+static void test_memslot_do_unmap(struct vm_data *data,
+				  uint64_t offsp, uint64_t count)
+{
+	uint64_t gpa, ctr;
+
+	for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) {
+		uint64_t npages;
+		void *hva;
+		int ret;
+
+		hva = vm_gpa2hva(data, gpa, &npages);
+		TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa);
+		npages = min(npages, count - ctr);
+		ret = madvise(hva, npages * 4096, MADV_DONTNEED);
+		TEST_ASSERT(!ret,
+			    "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64,
+			    hva, gpa);
+		ctr += npages;
+		gpa += npages * 4096;
+	}
+	TEST_ASSERT(ctr == count,
+		    "madvise(MADV_DONTNEED) should exactly cover all of the requested area");
+}
+
+static void test_memslot_map_unmap_check(struct vm_data *data,
+					 uint64_t offsp, uint64_t valexp)
+{
+	uint64_t gpa;
+	uint64_t *val;
+
+	if (!map_unmap_verify)
+		return;
+
+	gpa = MEM_TEST_GPA + offsp * 4096;
+	val = (typeof(val))vm_gpa2hva(data, gpa, NULL);
+	TEST_ASSERT(*val == valexp,
+		    "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")",
+		    *val, valexp, gpa);
+	*val = 0;
+}
+
+static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync)
+{
+	/*
+	 * Unmap the second half of the test area while guest writes to (maps)
+	 * the first half.
+	 */
+	test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2,
+			      MEM_TEST_MAP_SIZE_PAGES / 2);
+
+	/*
+	 * Wait for the guest to finish writing the first half of the test
+	 * area, verify the written value on the first and the last page of
+	 * this area and then unmap it.
+	 * Meanwhile, the guest is writing to (mapping) the second half of
+	 * the test area.
+	 */
+	host_perform_sync(sync);
+	test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
+	test_memslot_map_unmap_check(data,
+				     MEM_TEST_MAP_SIZE_PAGES / 2 - 1,
+				     MEM_TEST_VAL_1);
+	test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2);
+
+
+	/*
+	 * Wait for the guest to finish writing the second half of the test
+	 * area and verify the written value on the first and the last page
+	 * of this area.
+	 * The area will be unmapped at the beginning of the next loop
+	 * iteration.
+	 * Meanwhile, the guest is writing to (mapping) the first half of
+	 * the test area.
+	 */
+	host_perform_sync(sync);
+	test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2,
+				     MEM_TEST_VAL_2);
+	test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1,
+				     MEM_TEST_VAL_2);
+}
+
+static void test_memslot_unmap_loop_common(struct vm_data *data,
+					   struct sync_area *sync,
+					   uint64_t chunk)
+{
+	uint64_t ctr;
+
+	/*
+	 * Wait for the guest to finish mapping page(s) in the first half
+	 * of the test area, verify the written value and then perform unmap
+	 * of this area.
+	 * Meanwhile, the guest is writing to (mapping) page(s) in the second
+	 * half of the test area.
+	 */
+	host_perform_sync(sync);
+	test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1);
+	for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk)
+		test_memslot_do_unmap(data, ctr, chunk);
+
+	/* Likewise, but for the opposite host / guest areas */
+	host_perform_sync(sync);
+	test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2,
+				     MEM_TEST_VAL_2);
+	for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2;
+	     ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk)
+		test_memslot_do_unmap(data, ctr, chunk);
+}
+
+static void test_memslot_unmap_loop(struct vm_data *data,
+				    struct sync_area *sync)
+{
+	test_memslot_unmap_loop_common(data, sync, 1);
+}
+
+static void test_memslot_unmap_loop_chunked(struct vm_data *data,
+					    struct sync_area *sync)
+{
+	test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES);
+}
+
+static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync)
+{
+	uint64_t gptr;
+
+	for (gptr = MEM_TEST_GPA + 4096 / 2;
+	     gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096)
+		*(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2;
+
+	host_perform_sync(sync);
+
+	for (gptr = MEM_TEST_GPA;
+	     gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) {
+		uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL);
+		uint64_t val = *vptr;
+
+		TEST_ASSERT(val == MEM_TEST_VAL_1,
+			    "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")",
+			    val, gptr);
+		*vptr = 0;
+	}
+
+	host_perform_sync(sync);
+}
+
+struct test_data {
+	const char *name;
+	uint64_t mem_size;
+	void (*guest_code)(void);
+	bool (*prepare)(struct vm_data *data, struct sync_area *sync,
+			uint64_t *maxslots);
+	void (*loop)(struct vm_data *data, struct sync_area *sync);
+};
+
+static bool test_execute(int nslots, uint64_t *maxslots,
+			 unsigned int maxtime,
+			 const struct test_data *tdata,
+			 uint64_t *nloops,
+			 struct timespec *slot_runtime,
+			 struct timespec *guest_runtime)
+{
+	uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES;
+	struct vm_data *data;
+	struct sync_area *sync;
+	struct timespec tstart;
+	bool ret = true;
+
+	data = alloc_vm();
+	if (!prepare_vm(data, nslots, maxslots, tdata->guest_code,
+			mem_size, slot_runtime)) {
+		ret = false;
+		goto exit_free;
+	}
+
+	sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL);
+
+	if (tdata->prepare &&
+	    !tdata->prepare(data, sync, maxslots)) {
+		ret = false;
+		goto exit_free;
+	}
+
+	launch_vm(data);
+
+	clock_gettime(CLOCK_MONOTONIC, &tstart);
+	let_guest_run(sync);
+
+	while (1) {
+		*guest_runtime = timespec_elapsed(tstart);
+		if (guest_runtime->tv_sec >= maxtime)
+			break;
+
+		tdata->loop(data, sync);
+
+		(*nloops)++;
+	}
+
+	make_guest_exit(sync);
+	wait_guest_exit(data);
+
+exit_free:
+	free_vm(data);
+
+	return ret;
+}
+
+static const struct test_data tests[] = {
+	{
+		.name = "map",
+		.mem_size = MEM_SIZE_MAP_PAGES,
+		.guest_code = guest_code_test_memslot_map,
+		.loop = test_memslot_map_loop,
+	},
+	{
+		.name = "unmap",
+		.mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+		.guest_code = guest_code_test_memslot_unmap,
+		.loop = test_memslot_unmap_loop,
+	},
+	{
+		.name = "unmap chunked",
+		.mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1,
+		.guest_code = guest_code_test_memslot_unmap,
+		.loop = test_memslot_unmap_loop_chunked,
+	},
+	{
+		.name = "move active area",
+		.guest_code = guest_code_test_memslot_move,
+		.prepare = test_memslot_move_prepare_active,
+		.loop = test_memslot_move_loop,
+	},
+	{
+		.name = "move inactive area",
+		.guest_code = guest_code_test_memslot_move,
+		.prepare = test_memslot_move_prepare_inactive,
+		.loop = test_memslot_move_loop,
+	},
+	{
+		.name = "RW",
+		.guest_code = guest_code_test_memslot_rw,
+		.loop = test_memslot_rw_loop
+	},
+};
+
+#define NTESTS ARRAY_SIZE(tests)
+
+struct test_args {
+	int tfirst;
+	int tlast;
+	int nslots;
+	int seconds;
+	int runs;
+};
+
+static void help(char *name, struct test_args *targs)
+{
+	int ctr;
+
+	pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n",
+		name);
+	pr_info(" -h: print this help screen.\n");
+	pr_info(" -v: enable verbose mode (not for benchmarking).\n");
+	pr_info(" -d: enable extra debug checks.\n");
+	pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
+		targs->nslots);
+	pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
+		targs->tfirst, NTESTS - 1);
+	pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n",
+		targs->tlast, NTESTS - 1);
+	pr_info(" -l: specify the test length in seconds (currently: %i)\n",
+		targs->seconds);
+	pr_info(" -r: specify the number of runs per test (currently: %i)\n",
+		targs->runs);
+
+	pr_info("\nAvailable tests:\n");
+	for (ctr = 0; ctr < NTESTS; ctr++)
+		pr_info("%d: %s\n", ctr, tests[ctr].name);
+}
+
+static bool parse_args(int argc, char *argv[],
+		       struct test_args *targs)
+{
+	int opt;
+
+	while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
+		switch (opt) {
+		case 'h':
+		default:
+			help(argv[0], targs);
+			return false;
+		case 'v':
+			verbose = true;
+			break;
+		case 'd':
+			map_unmap_verify = true;
+			break;
+		case 's':
+			targs->nslots = atoi(optarg);
+			if (targs->nslots <= 0 && targs->nslots != -1) {
+				pr_info("Slot count cap has to be positive or -1 for no cap\n");
+				return false;
+			}
+			break;
+		case 'f':
+			targs->tfirst = atoi(optarg);
+			if (targs->tfirst < 0) {
+				pr_info("First test to run has to be non-negative\n");
+				return false;
+			}
+			break;
+		case 'e':
+			targs->tlast = atoi(optarg);
+			if (targs->tlast < 0 || targs->tlast >= NTESTS) {
+				pr_info("Last test to run has to be non-negative and less than %zu\n",
+					NTESTS);
+				return false;
+			}
+			break;
+		case 'l':
+			targs->seconds = atoi(optarg);
+			if (targs->seconds < 0) {
+				pr_info("Test length in seconds has to be non-negative\n");
+				return false;
+			}
+			break;
+		case 'r':
+			targs->runs = atoi(optarg);
+			if (targs->runs <= 0) {
+				pr_info("Runs per test has to be positive\n");
+				return false;
+			}
+			break;
+		}
+	}
+
+	if (optind < argc) {
+		help(argv[0], targs);
+		return false;
+	}
+
+	if (targs->tfirst > targs->tlast) {
+		pr_info("First test to run cannot be greater than the last test to run\n");
+		return false;
+	}
+
+	return true;
+}
+
+struct test_result {
+	struct timespec slot_runtime, guest_runtime, iter_runtime;
+	int64_t slottimens, runtimens;
+	uint64_t nloops;
+};
+
+static bool test_loop(const struct test_data *data,
+		      const struct test_args *targs,
+		      struct test_result *rbestslottime,
+		      struct test_result *rbestruntime)
+{
+	uint64_t maxslots;
+	struct test_result result;
+
+	result.nloops = 0;
+	if (!test_execute(targs->nslots, &maxslots, targs->seconds, data,
+			  &result.nloops,
+			  &result.slot_runtime, &result.guest_runtime)) {
+		if (maxslots)
+			pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n",
+				maxslots);
+		else
+			pr_info("Memslot count may be too high for this test, try adjusting the cap\n");
+
+		return false;
+	}
+
+	pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n",
+		result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec,
+		result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec);
+	if (!result.nloops) {
+		pr_info("No full loops done - too short test time or system too loaded?\n");
+		return true;
+	}
+
+	result.iter_runtime = timespec_div(result.guest_runtime,
+					   result.nloops);
+	pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n",
+		result.nloops,
+		result.iter_runtime.tv_sec,
+		result.iter_runtime.tv_nsec);
+	result.slottimens = timespec_to_ns(result.slot_runtime);
+	result.runtimens = timespec_to_ns(result.iter_runtime);
+
+	/*
+	 * Only rank the slot setup time for tests using the whole test memory
+	 * area so they are comparable
+	 */
+	if (!data->mem_size &&
+	    (!rbestslottime->slottimens ||
+	     result.slottimens < rbestslottime->slottimens))
+		*rbestslottime = result;
+	if (!rbestruntime->runtimens ||
+	    result.runtimens < rbestruntime->runtimens)
+		*rbestruntime = result;
+
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	struct test_args targs = {
+		.tfirst = 0,
+		.tlast = NTESTS - 1,
+		.nslots = -1,
+		.seconds = 5,
+		.runs = 20,
+	};
+	struct test_result rbestslottime;
+	int tctr;
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	if (!parse_args(argc, argv, &targs))
+		return -1;
+
+	rbestslottime.slottimens = 0;
+	for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) {
+		const struct test_data *data = &tests[tctr];
+		unsigned int runctr;
+		struct test_result rbestruntime;
+
+		if (tctr > targs.tfirst)
+			pr_info("\n");
+
+		pr_info("Testing %s performance with %i runs, %d seconds each\n",
+			data->name, targs.runs, targs.seconds);
+
+		rbestruntime.runtimens = 0;
+		for (runctr = 0; runctr < targs.runs; runctr++)
+			if (!test_loop(data, &targs,
+				       &rbestslottime, &rbestruntime))
+				break;
+
+		if (rbestruntime.runtimens)
+			pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n",
+				rbestruntime.iter_runtime.tv_sec,
+				rbestruntime.iter_runtime.tv_nsec,
+				rbestruntime.nloops);
+	}
+
+	if (rbestslottime.slottimens)
+		pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n",
+			rbestslottime.slot_runtime.tv_sec,
+			rbestslottime.slot_runtime.tv_nsec);
+
+	return 0;
+}
-- 
cgit v1.2.3


From ef4c9f4f654622fa15b7a94a9bd1f19e76bb7feb Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 21 May 2021 17:38:28 +0000
Subject: KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()

vm_get_max_gfn() casts vm->max_gfn from a uint64_t to an unsigned int,
which causes the upper 32-bits of the max_gfn to get truncated.

Nobody noticed until now likely because vm_get_max_gfn() is only used
as a mechanism to create a memslot in an unused region of the guest
physical address space (the top), and the top of the 32-bit physical
address space was always good enough.

This fix reveals a bug in memslot_modification_stress_test which was
trying to create a dummy memslot past the end of guest physical memory.
Fix that by moving the dummy memslot lower.

Fixes: 52200d0d944e ("KVM: selftests: Remove duplicate guest mode handling")
Reviewed-by: Venkatesh Srinivas <venkateshs@chromium.org>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20210521173828.1180619-1-dmatlack@google.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h         |  2 +-
 tools/testing/selftests/kvm/lib/kvm_util.c             |  2 +-
 tools/testing/selftests/kvm/lib/perf_test_util.c       |  4 +++-
 .../selftests/kvm/memslot_modification_stress_test.c   | 18 +++++++++++-------
 4 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index a8f022794ce3..2e0d253dabd6 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -302,7 +302,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
 unsigned int vm_get_page_size(struct kvm_vm *vm);
 unsigned int vm_get_page_shift(struct kvm_vm *vm);
-unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+uint64_t vm_get_max_gfn(struct kvm_vm *vm);
 int vm_get_fd(struct kvm_vm *vm);
 
 unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1255744758e3..ea3f0db85b3e 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2117,7 +2117,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm)
 	return vm->page_shift;
 }
 
-unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+uint64_t vm_get_max_gfn(struct kvm_vm *vm)
 {
 	return vm->max_gfn;
 }
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index 81490b9b4e32..abf381800a59 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -2,6 +2,7 @@
 /*
  * Copyright (C) 2020, Google LLC.
  */
+#include <inttypes.h>
 
 #include "kvm_util.h"
 #include "perf_test_util.h"
@@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
 	 */
 	TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
 		    "Requested more guest memory than address space allows.\n"
-		    "    guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
+		    "    guest pages: %" PRIx64 " max gfn: %" PRIx64
+		    " vcpus: %d wss: %" PRIx64 "]\n",
 		    guest_num_pages, vm_get_max_gfn(vm), vcpus,
 		    vcpu_memory_bytes);
 
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 6096bf0a5b34..98351ba0933c 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -71,14 +71,22 @@ struct memslot_antagonist_args {
 };
 
 static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
-			      uint64_t nr_modifications, uint64_t gpa)
+			       uint64_t nr_modifications)
 {
+	const uint64_t pages = 1;
+	uint64_t gpa;
 	int i;
 
+	/*
+	 * Add the dummy memslot just below the perf_test_util memslot, which is
+	 * at the top of the guest physical address space.
+	 */
+	gpa = guest_test_phys_mem - pages * vm_get_page_size(vm);
+
 	for (i = 0; i < nr_modifications; i++) {
 		usleep(delay);
 		vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa,
-					    DUMMY_MEMSLOT_INDEX, 1, 0);
+					    DUMMY_MEMSLOT_INDEX, pages, 0);
 
 		vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX);
 	}
@@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	pr_info("Started all vCPUs\n");
 
 	add_remove_memslot(vm, p->memslot_modification_delay,
-			   p->nr_memslot_modifications,
-			   guest_test_phys_mem +
-			   (guest_percpu_mem_size * nr_vcpus) +
-			   perf_test_args.host_page_size +
-			   perf_test_args.guest_page_size);
+			   p->nr_memslot_modifications);
 
 	run_vcpus = false;
 
-- 
cgit v1.2.3


From 50bc913d526beb9937f1eb0159ec63c43234f961 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Wed, 19 May 2021 21:13:45 +0000
Subject: KVM: selftests: Ignore CPUID.0DH.1H in get_cpuid_test

Similar to CPUID.0DH.0H this entry depends on the vCPU's XCR0 register
and IA32_XSS MSR. Since this test does not control for either before
assigning the vCPU's CPUID, these entries will not necessarily match
the supported CPUID exposed by KVM.

This fixes get_cpuid_test on Cascade Lake CPUs.

Suggested-by: Jim Mattson <jmattson@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20210519211345.3944063-1-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86_64/get_cpuid_test.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
index 9b78e8889638..8c77537af5a1 100644
--- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
@@ -19,7 +19,12 @@ struct {
 	u32 function;
 	u32 index;
 } mangled_cpuids[] = {
+	/*
+	 * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
+	 * which are not controlled for by this test.
+	 */
 	{.function = 0xd, .index = 0},
+	{.function = 0xd, .index = 1},
 };
 
 static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
-- 
cgit v1.2.3


From a10453c038a7e97169185405242d20d21de0bb91 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 14 May 2021 23:05:21 +0000
Subject: KVM: selftests: Fix hang in hardware_disable_test

If /dev/kvm is not available then hardware_disable_test will hang
indefinitely because the child process exits before posting to the
semaphore for which the parent is waiting.

Fix this by making the parent periodically check if the child has
exited. We have to be careful to forward the child's exit status to
preserve a KSFT_SKIP status.

I considered just checking for /dev/kvm before creating the child
process, but there are so many other reasons why the child could exit
early that it seemed better to handle that as general case.

Tested:

$ ./hardware_disable_test
/dev/kvm not available, skipping test
$ echo $?
4
$ modprobe kvm_intel
$ ./hardware_disable_test
$ echo $?
0

Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20210514230521.2608768-1-dmatlack@google.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../testing/selftests/kvm/hardware_disable_test.c  | 32 +++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c
index 5aadf84c91c0..4b8db3bce610 100644
--- a/tools/testing/selftests/kvm/hardware_disable_test.c
+++ b/tools/testing/selftests/kvm/hardware_disable_test.c
@@ -132,6 +132,36 @@ static void run_test(uint32_t run)
 	TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run);
 }
 
+void wait_for_child_setup(pid_t pid)
+{
+	/*
+	 * Wait for the child to post to the semaphore, but wake up periodically
+	 * to check if the child exited prematurely.
+	 */
+	for (;;) {
+		const struct timespec wait_period = { .tv_sec = 1 };
+		int status;
+
+		if (!sem_timedwait(sem, &wait_period))
+			return;
+
+		/* Child is still running, keep waiting. */
+		if (pid != waitpid(pid, &status, WNOHANG))
+			continue;
+
+		/*
+		 * Child is no longer running, which is not expected.
+		 *
+		 * If it exited with a non-zero status, we explicitly forward
+		 * the child's status in case it exited with KSFT_SKIP.
+		 */
+		if (WIFEXITED(status))
+			exit(WEXITSTATUS(status));
+		else
+			TEST_ASSERT(false, "Child exited unexpectedly");
+	}
+}
+
 int main(int argc, char **argv)
 {
 	uint32_t i;
@@ -148,7 +178,7 @@ int main(int argc, char **argv)
 			run_test(i); /* This function always exits */
 
 		pr_debug("%s: [%d] waiting semaphore\n", __func__, i);
-		sem_wait(sem);
+		wait_for_child_setup(pid);
 		r = (rand() % DELAY_US_MAX) + 1;
 		pr_debug("%s: [%d] waiting %dus\n", __func__, i, r);
 		usleep(r);
-- 
cgit v1.2.3


From c887d6a126dfc50b27872527615dd46cb3d96bc1 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:30 -0700
Subject: KVM: selftests: trivial comment/logging fixes

Some trivial fixes I found while touching related code in this series,
factored out into a separate commit for easier reviewing:

- s/gor/got/ and add a newline in demand_paging_test.c
- s/backing_src/src_type/ in a comment to be consistent with the real
  function signature in kvm_util.c

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-2-axelrasmussen@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c | 2 +-
 tools/testing/selftests/kvm/lib/kvm_util.c       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..9398ba6ef023 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -169,7 +169,7 @@ static void *uffd_handler_thread_fn(void *arg)
 		if (r == -1) {
 			if (errno == EAGAIN)
 				continue;
-			pr_info("Read of uffd gor errno %d", errno);
+			pr_info("Read of uffd got errno %d\n", errno);
 			return NULL;
 		}
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ea3f0db85b3e..f4484e1edcfa 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -731,8 +731,8 @@ static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
  *
  * Input Args:
  *   vm - Virtual Machine
- *   backing_src - Storage source for this region.
- *                 NULL to use anonymous memory.
+ *   src_type - Storage source for this region.
+ *              NULL to use anonymous memory.
  *   guest_paddr - Starting guest physical address
  *   slot - KVM region slot
  *   npages - Number of physical pages
-- 
cgit v1.2.3


From 2aab4b355cbbe1deacfd9349729c43509042b557 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Tue, 11 May 2021 20:21:20 +0000
Subject: KVM: selftests: Print a message if /dev/kvm is missing

If a KVM selftest is run on a machine without /dev/kvm, it will exit
silently. Make it easy to tell what's happening by printing an error
message.

Opportunistically consolidate all codepaths that open /dev/kvm into a
single function so they all print the same message.

This slightly changes the semantics of vm_is_unrestricted_guest() by
changing a TEST_ASSERT() to exit(KSFT_SKIP). However
vm_is_unrestricted_guest() is only called in one place
(x86_64/mmio_warning_test.c) and that is to determine if the test should
be skipped or not.

Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20210511202120.1371800-1-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c         | 46 +++++++++++++++-------
 tools/testing/selftests/kvm/lib/x86_64/processor.c | 16 ++------
 .../selftests/kvm/x86_64/get_msr_index_features.c  |  8 +---
 4 files changed, 39 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 2e0d253dabd6..5d9b35d09251 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -77,6 +77,7 @@ struct vm_guest_mode_params {
 };
 extern const struct vm_guest_mode_params vm_guest_mode_params[];
 
+int open_kvm_dev_path_or_exit(void);
 int kvm_check_cap(long cap);
 int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index f4484e1edcfa..d00e49b73d68 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -31,6 +31,34 @@ static void *align(void *x, size_t size)
 	return (void *) (((size_t) x + mask) & ~mask);
 }
 
+/*
+ * Open KVM_DEV_PATH if available, otherwise exit the entire program.
+ *
+ * Input Args:
+ *   flags - The flags to pass when opening KVM_DEV_PATH.
+ *
+ * Return:
+ *   The opened file descriptor of /dev/kvm.
+ */
+static int _open_kvm_dev_path_or_exit(int flags)
+{
+	int fd;
+
+	fd = open(KVM_DEV_PATH, flags);
+	if (fd < 0) {
+		print_skip("%s not available, is KVM loaded? (errno: %d)",
+			   KVM_DEV_PATH, errno);
+		exit(KSFT_SKIP);
+	}
+
+	return fd;
+}
+
+int open_kvm_dev_path_or_exit(void)
+{
+	return _open_kvm_dev_path_or_exit(O_RDONLY);
+}
+
 /*
  * Capability
  *
@@ -52,10 +80,7 @@ int kvm_check_cap(long cap)
 	int ret;
 	int kvm_fd;
 
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
-
+	kvm_fd = open_kvm_dev_path_or_exit();
 	ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
 	TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
 		"  rc: %i errno: %i", ret, errno);
@@ -128,9 +153,7 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
 
 static void vm_open(struct kvm_vm *vm, int perm)
 {
-	vm->kvm_fd = open(KVM_DEV_PATH, perm);
-	if (vm->kvm_fd < 0)
-		exit(KSFT_SKIP);
+	vm->kvm_fd = _open_kvm_dev_path_or_exit(perm);
 
 	if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
 		print_skip("immediate_exit not available");
@@ -996,9 +1019,7 @@ static int vcpu_mmap_sz(void)
 {
 	int dev_fd, ret;
 
-	dev_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (dev_fd < 0)
-		exit(KSFT_SKIP);
+	dev_fd = open_kvm_dev_path_or_exit();
 
 	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
 	TEST_ASSERT(ret >= sizeof(struct kvm_run),
@@ -2091,10 +2112,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm)
 
 	if (vm == NULL) {
 		/* Ensure that the KVM vendor-specific module is loaded. */
-		f = fopen(KVM_DEV_PATH, "r");
-		TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d",
-			    errno);
-		fclose(f);
+		close(open_kvm_dev_path_or_exit());
 	}
 
 	f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r");
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index a8906e60a108..efe235044421 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -657,9 +657,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
 		return cpuid;
 
 	cpuid = allocate_kvm_cpuid2();
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
 	TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
@@ -691,9 +689,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index)
 
 	buffer.header.nmsrs = 1;
 	buffer.entry.index = msr_index;
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
 	TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
@@ -986,9 +982,7 @@ struct kvm_msr_list *kvm_get_msr_index_list(void)
 	struct kvm_msr_list *list;
 	int nmsrs, r, kvm_fd;
 
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	nmsrs = kvm_get_num_msrs_fd(kvm_fd);
 	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
@@ -1312,9 +1306,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
 		return cpuid;
 
 	cpuid = allocate_kvm_cpuid2();
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
 	TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
index cb953df4d7d0..8aed0db1331d 100644
--- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
+++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
@@ -37,9 +37,7 @@ static void test_get_msr_index(void)
 	int old_res, res, kvm_fd, r;
 	struct kvm_msr_list *list;
 
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	old_res = kvm_num_index_msrs(kvm_fd, 0);
 	TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
@@ -101,9 +99,7 @@ static void test_get_msr_feature(void)
 	int res, old_res, i, kvm_fd;
 	struct kvm_msr_list *feature_list;
 
-	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
+	kvm_fd = open_kvm_dev_path_or_exit();
 
 	old_res = kvm_num_feature_msrs(kvm_fd, 0);
 	TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
-- 
cgit v1.2.3


From 25408e5a0246048e3e36d2cd513565ebcc481f51 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:31 -0700
Subject: KVM: selftests: simplify setup_demand_paging error handling

A small cleanup. Our caller writes:

  r = setup_demand_paging(...);
  if (r < 0) exit(-r);

Since we're just going to exit anyway, instead of returning an error we
can just re-use TEST_ASSERT. This makes the caller simpler, as well as
the function itself - no need to write our branches, etc.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-3-axelrasmussen@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c | 50 +++++++++---------------
 1 file changed, 18 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 9398ba6ef023..8ce53488d6af 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -9,6 +9,7 @@
 
 #define _GNU_SOURCE /* for pipe2 */
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
@@ -198,42 +199,32 @@ static void *uffd_handler_thread_fn(void *arg)
 	return NULL;
 }
 
-static int setup_demand_paging(struct kvm_vm *vm,
-			       pthread_t *uffd_handler_thread, int pipefd,
-			       useconds_t uffd_delay,
-			       struct uffd_handler_args *uffd_args,
-			       void *hva, uint64_t len)
+static void setup_demand_paging(struct kvm_vm *vm,
+				pthread_t *uffd_handler_thread, int pipefd,
+				useconds_t uffd_delay,
+				struct uffd_handler_args *uffd_args,
+				void *hva, uint64_t len)
 {
 	int uffd;
 	struct uffdio_api uffdio_api;
 	struct uffdio_register uffdio_register;
 
 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-	if (uffd == -1) {
-		pr_info("uffd creation failed\n");
-		return -1;
-	}
+	TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
 
 	uffdio_api.api = UFFD_API;
 	uffdio_api.features = 0;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
-		pr_info("ioctl uffdio_api failed\n");
-		return -1;
-	}
+	TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
+		    "ioctl UFFDIO_API failed: %" PRIu64,
+		    (uint64_t)uffdio_api.api);
 
 	uffdio_register.range.start = (uint64_t)hva;
 	uffdio_register.range.len = len;
 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
-		pr_info("ioctl uffdio_register failed\n");
-		return -1;
-	}
-
-	if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
-			UFFD_API_RANGE_IOCTLS) {
-		pr_info("unexpected userfaultfd ioctl set\n");
-		return -1;
-	}
+	TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
+		    "ioctl UFFDIO_REGISTER failed");
+	TEST_ASSERT((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) ==
+		    UFFD_API_RANGE_IOCTLS, "unexpected userfaultfd ioctl set");
 
 	uffd_args->uffd = uffd;
 	uffd_args->pipefd = pipefd;
@@ -243,8 +234,6 @@ static int setup_demand_paging(struct kvm_vm *vm,
 
 	PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
 		       hva, hva + len);
-
-	return 0;
 }
 
 struct test_params {
@@ -321,13 +310,10 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 				  O_CLOEXEC | O_NONBLOCK);
 			TEST_ASSERT(!r, "Failed to set up pipefd");
 
-			r = setup_demand_paging(vm,
-						&uffd_handler_threads[vcpu_id],
-						pipefds[vcpu_id * 2],
-						p->uffd_delay, &uffd_args[vcpu_id],
-						vcpu_hva, vcpu_mem_size);
-			if (r < 0)
-				exit(-r);
+			setup_demand_paging(vm, &uffd_handler_threads[vcpu_id],
+					    pipefds[vcpu_id * 2], p->uffd_delay,
+					    &uffd_args[vcpu_id], vcpu_hva,
+					    vcpu_mem_size);
 		}
 	}
 
-- 
cgit v1.2.3


From 32ffa4f71e10009498ae6b54da65ab316db967bd Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:33 -0700
Subject: KVM: selftests: compute correct demand paging size

This is a preparatory commit needed before we can use different kinds of
backing pages for guest memory.

Previously, we used perf_test_args.host_page_size, which is the host's
native page size (commonly 4K). For VM_MEM_SRC_ANONYMOUS this turns out
to be okay, but in a follow-up commit we want to allow using different
kinds of backing memory.

Take VM_MEM_SRC_ANONYMOUS_HUGETLB for example. Without this change, if
we used that backing page type, when we issued a UFFDIO_COPY ioctl we'd
only do so with 4K, rather than the full 2M of a backing hugepage. In
this case, UFFDIO_COPY returns -EINVAL (__mcopy_atomic_hugetlb checks
the size).

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-5-axelrasmussen@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 8ce53488d6af..e6582f504c0f 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -39,6 +39,7 @@
 
 static int nr_vcpus = 1;
 static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+static size_t demand_paging_size;
 static char *guest_data_prototype;
 
 static void *vcpu_worker(void *data)
@@ -84,7 +85,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
 
 	copy.src = (uint64_t)guest_data_prototype;
 	copy.dst = addr;
-	copy.len = perf_test_args.host_page_size;
+	copy.len = demand_paging_size;
 	copy.mode = 0;
 
 	clock_gettime(CLOCK_MONOTONIC, &start);
@@ -101,7 +102,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
 	PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
 		       timespec_to_ns(ts_diff));
 	PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
-		       perf_test_args.host_page_size, addr, tid);
+		       demand_paging_size, addr, tid);
 
 	return 0;
 }
@@ -260,10 +261,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	perf_test_args.wr_fract = 1;
 
-	guest_data_prototype = malloc(perf_test_args.host_page_size);
+	demand_paging_size = get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS);
+
+	guest_data_prototype = malloc(demand_paging_size);
 	TEST_ASSERT(guest_data_prototype,
 		    "Failed to allocate buffer for guest data pattern");
-	memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
+	memset(guest_data_prototype, 0xAB, demand_paging_size);
 
 	vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
 	TEST_ASSERT(vcpu_threads, "Memory allocation failed");
-- 
cgit v1.2.3


From 0368c2c1b422c94968b5286f289aed7fe6af93c2 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:34 -0700
Subject: KVM: selftests: allow different backing source types

Add an argument which lets us specify a different backing memory type
for the test. The default is just to use anonymous, matching existing
behavior.

This is in preparation for testing UFFD minor faults. For that, we'll
need to use a new backing memory type which is setup with MAP_SHARED.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-6-axelrasmussen@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index e6582f504c0f..8c03484a5784 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -240,6 +240,7 @@ static void setup_demand_paging(struct kvm_vm *vm,
 struct test_params {
 	bool use_uffd;
 	useconds_t uffd_delay;
+	enum vm_mem_backing_src_type src_type;
 	bool partition_vcpu_memory_access;
 };
 
@@ -257,11 +258,11 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	int r;
 
 	vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
-				 VM_MEM_SRC_ANONYMOUS);
+				 p->src_type);
 
 	perf_test_args.wr_fract = 1;
 
-	demand_paging_size = get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS);
+	demand_paging_size = get_backing_src_pagesz(p->src_type);
 
 	guest_data_prototype = malloc(demand_paging_size);
 	TEST_ASSERT(guest_data_prototype,
@@ -377,7 +378,7 @@ static void help(char *name)
 {
 	puts("");
 	printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
-	       "          [-b memory] [-v vcpus] [-o]\n", name);
+	       "          [-b memory] [-t type] [-v vcpus] [-o]\n", name);
 	guest_modes_help();
 	printf(" -u: use User Fault FD to handle vCPU page\n"
 	       "     faults.\n");
@@ -387,6 +388,8 @@ static void help(char *name)
 	printf(" -b: specify the size of the memory region which should be\n"
 	       "     demand paged by each vCPU. e.g. 10M or 3G.\n"
 	       "     Default: 1G\n");
+	printf(" -t: The type of backing memory to use. Default: anonymous\n");
+	backing_src_help();
 	printf(" -v: specify the number of vCPUs to run.\n");
 	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
 	       "     them into a separate region of memory for each vCPU.\n");
@@ -398,13 +401,14 @@ int main(int argc, char *argv[])
 {
 	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 	struct test_params p = {
+		.src_type = VM_MEM_SRC_ANONYMOUS,
 		.partition_vcpu_memory_access = true,
 	};
 	int opt;
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:ud:b:t:v:o")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
@@ -419,6 +423,9 @@ int main(int argc, char *argv[])
 		case 'b':
 			guest_percpu_mem_size = parse_size(optarg);
 			break;
+		case 't':
+			p.src_type = parse_backing_src_type(optarg);
+			break;
 		case 'v':
 			nr_vcpus = atoi(optarg);
 			TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
-- 
cgit v1.2.3


From b3784bc28ccc0d9b44d265a1d947c8766295ba00 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:35 -0700
Subject: KVM: selftests: refactor vm_mem_backing_src_type flags

Each struct vm_mem_backing_src_alias has a flags field, which denotes
the flags used to mmap() an area of that type. Previously, this field
never included MAP_PRIVATE | MAP_ANONYMOUS, because
vm_userspace_mem_region_add assumed that *all* types would always use
those flags, and so it hardcoded them.

In a follow-up commit, we'll add a new type: shmem. Areas of this type
must not have MAP_PRIVATE | MAP_ANONYMOUS, and instead they must have
MAP_SHARED.

So, refactor things. Make it so that the flags field of
struct vm_mem_backing_src_alias really is a complete set of flags, and
don't add in any extras in vm_userspace_mem_region_add. This will let us
easily tack on shmem.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-7-axelrasmussen@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c  |  3 +--
 tools/testing/selftests/kvm/lib/test_util.c | 35 ++++++++++++++++-------------
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index d00e49b73d68..491be22b410c 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -849,8 +849,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 
 	region->mmap_start = mmap(NULL, region->mmap_size,
 				  PROT_READ | PROT_WRITE,
-				  MAP_PRIVATE | MAP_ANONYMOUS
-				  | vm_mem_backing_src_alias(src_type)->flag,
+				  vm_mem_backing_src_alias(src_type)->flag,
 				  -1, 0);
 	TEST_ASSERT(region->mmap_start != MAP_FAILED,
 		    "test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 63d2bc7d757b..06ddde068736 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -168,70 +168,73 @@ size_t get_def_hugetlb_pagesz(void)
 
 const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
 {
+	static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+	static const int anon_huge_flags = anon_flags | MAP_HUGETLB;
+
 	static const struct vm_mem_backing_src_alias aliases[] = {
 		[VM_MEM_SRC_ANONYMOUS] = {
 			.name = "anonymous",
-			.flag = 0,
+			.flag = anon_flags,
 		},
 		[VM_MEM_SRC_ANONYMOUS_THP] = {
 			.name = "anonymous_thp",
-			.flag = 0,
+			.flag = anon_flags,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
 			.name = "anonymous_hugetlb",
-			.flag = MAP_HUGETLB,
+			.flag = anon_huge_flags,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
 			.name = "anonymous_hugetlb_16kb",
-			.flag = MAP_HUGETLB | MAP_HUGE_16KB,
+			.flag = anon_huge_flags | MAP_HUGE_16KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
 			.name = "anonymous_hugetlb_64kb",
-			.flag = MAP_HUGETLB | MAP_HUGE_64KB,
+			.flag = anon_huge_flags | MAP_HUGE_64KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
 			.name = "anonymous_hugetlb_512kb",
-			.flag = MAP_HUGETLB | MAP_HUGE_512KB,
+			.flag = anon_huge_flags | MAP_HUGE_512KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
 			.name = "anonymous_hugetlb_1mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_1MB,
+			.flag = anon_huge_flags | MAP_HUGE_1MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
 			.name = "anonymous_hugetlb_2mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_2MB,
+			.flag = anon_huge_flags | MAP_HUGE_2MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
 			.name = "anonymous_hugetlb_8mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_8MB,
+			.flag = anon_huge_flags | MAP_HUGE_8MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
 			.name = "anonymous_hugetlb_16mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_16MB,
+			.flag = anon_huge_flags | MAP_HUGE_16MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
 			.name = "anonymous_hugetlb_32mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_32MB,
+			.flag = anon_huge_flags | MAP_HUGE_32MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
 			.name = "anonymous_hugetlb_256mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_256MB,
+			.flag = anon_huge_flags | MAP_HUGE_256MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
 			.name = "anonymous_hugetlb_512mb",
-			.flag = MAP_HUGETLB | MAP_HUGE_512MB,
+			.flag = anon_huge_flags | MAP_HUGE_512MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
 			.name = "anonymous_hugetlb_1gb",
-			.flag = MAP_HUGETLB | MAP_HUGE_1GB,
+			.flag = anon_huge_flags | MAP_HUGE_1GB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
 			.name = "anonymous_hugetlb_2gb",
-			.flag = MAP_HUGETLB | MAP_HUGE_2GB,
+			.flag = anon_huge_flags | MAP_HUGE_2GB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
 			.name = "anonymous_hugetlb_16gb",
-			.flag = MAP_HUGETLB | MAP_HUGE_16GB,
+			.flag = anon_huge_flags | MAP_HUGE_16GB,
 		},
 	};
 	_Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
-- 
cgit v1.2.3


From c9befd5958fdf8913db69049d47b6ac1d970af03 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:36 -0700
Subject: KVM: selftests: add shmem backing source type

This lets us run the demand paging test on top of a shmem-backed area.
In follow-up commits, we'll 1) leverage this new capability to create an
alias mapping, and then 2) use the alias mapping to exercise UFFD minor
faults.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-8-axelrasmussen@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c      | 17 ++++++++++++++++-
 tools/testing/selftests/kvm/lib/test_util.c     |  5 +++++
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index fade3130eb01..7377f00469ef 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -84,6 +84,7 @@ enum vm_mem_backing_src_type {
 	VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
 	VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
 	VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+	VM_MEM_SRC_SHMEM,
 	NUM_SRC_TYPES,
 };
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 491be22b410c..bc50ca6390d3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -847,10 +847,25 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 	if (alignment > 1)
 		region->mmap_size += alignment;
 
+	region->fd = -1;
+	if (src_type == VM_MEM_SRC_SHMEM) {
+		region->fd = memfd_create("kvm_selftest", MFD_CLOEXEC);
+		TEST_ASSERT(region->fd != -1,
+			    "memfd_create failed, errno: %i", errno);
+
+		ret = ftruncate(region->fd, region->mmap_size);
+		TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno);
+
+		ret = fallocate(region->fd,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+				region->mmap_size);
+		TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno);
+	}
+
 	region->mmap_start = mmap(NULL, region->mmap_size,
 				  PROT_READ | PROT_WRITE,
 				  vm_mem_backing_src_alias(src_type)->flag,
-				  -1, 0);
+				  region->fd, 0);
 	TEST_ASSERT(region->mmap_start != MAP_FAILED,
 		    "test_malloc failed, mmap_start: %p errno: %i",
 		    region->mmap_start, errno);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 06ddde068736..c7a265da5090 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -236,6 +236,10 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
 			.name = "anonymous_hugetlb_16gb",
 			.flag = anon_huge_flags | MAP_HUGE_16GB,
 		},
+		[VM_MEM_SRC_SHMEM] = {
+			.name = "shmem",
+			.flag = MAP_SHARED,
+		},
 	};
 	_Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
 		       "Missing new backing src types?");
@@ -253,6 +257,7 @@ size_t get_backing_src_pagesz(uint32_t i)
 
 	switch (i) {
 	case VM_MEM_SRC_ANONYMOUS:
+	case VM_MEM_SRC_SHMEM:
 		return getpagesize();
 	case VM_MEM_SRC_ANONYMOUS_THP:
 		return get_trans_hugepagesz();
-- 
cgit v1.2.3


From 94f3f2b31a8a9e8bd30bf6f4903ff84acc612e0e Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:37 -0700
Subject: KVM: selftests: create alias mappings when using shared memory

When a memory region is added with a src_type specifying that it should
use some kind of shared memory, also create an alias mapping to the same
underlying physical pages.

And, add an API so tests can get access to these alias addresses.
Basically, for a guest physical address, let us look up the analogous
host *alias* address.

In a future commit, we'll modify the demand paging test to take
advantage of this to exercise UFFD minor faults. The idea is, we
pre-fault the underlying pages *via the alias*. When the *guest*
faults, it gets a "minor" fault (PTEs don't exist yet, but a page is
already in the page cache). Then, the userfaultfd theads can handle the
fault: they could potentially modify the underlying memory *via the
alias* if they wanted to, and then they install the PTEs and let the
guest carry on via a UFFDIO_CONTINUE ioctl.

Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-9-axelrasmussen@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     |  1 +
 tools/testing/selftests/kvm/lib/kvm_util.c         | 49 ++++++++++++++++++++++
 .../testing/selftests/kvm/lib/kvm_util_internal.h  |  2 +
 3 files changed, 52 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 5d9b35d09251..fcd8e3855111 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -147,6 +147,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
 void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
 
 /*
  * Address Guest Virtual to Guest Physical
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index bc50ca6390d3..f62780719176 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -903,6 +903,19 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
 	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
 	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
+
+	/* If shared memory, create an alias. */
+	if (region->fd >= 0) {
+		region->mmap_alias = mmap(NULL, region->mmap_size,
+					  PROT_READ | PROT_WRITE,
+					  vm_mem_backing_src_alias(src_type)->flag,
+					  region->fd, 0);
+		TEST_ASSERT(region->mmap_alias != MAP_FAILED,
+			    "mmap of alias failed, errno: %i", errno);
+
+		/* Align host alias address */
+		region->host_alias = align(region->mmap_alias, alignment);
+	}
 }
 
 /*
@@ -1333,6 +1346,42 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
 	return -1;
 }
 
+/*
+ * Address VM physical to Host Virtual *alias*.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gpa - VM physical address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent address within the host virtual *alias* area, or NULL
+ *   (without failing the test) if the guest memory is not shared (so
+ *   no alias exists).
+ *
+ * When vm_create() and related functions are called with a shared memory
+ * src_type, we also create a writable, shared alias mapping of the
+ * underlying guest memory. This allows the host to manipulate guest memory
+ * without mapping that memory in the guest's address space. And, for
+ * userfaultfd-based demand paging, we can do so without triggering userfaults.
+ */
+void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+	struct userspace_mem_region *region;
+	uintptr_t offset;
+
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	if (!region)
+		return NULL;
+
+	if (!region->host_alias)
+		return NULL;
+
+	offset = gpa - region->region.guest_phys_addr;
+	return (void *) ((uintptr_t) region->host_alias + offset);
+}
+
 /*
  * VM Create IRQ Chip
  *
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index b30e8c7b119b..a03febc24ba6 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -19,7 +19,9 @@ struct userspace_mem_region {
 	int fd;
 	off_t offset;
 	void *host_mem;
+	void *host_alias;
 	void *mmap_start;
+	void *mmap_alias;
 	size_t mmap_size;
 	struct rb_node gpa_node;
 	struct rb_node hva_node;
-- 
cgit v1.2.3


From a4b9722a5996017264feb19ebe86efe4380f7afb Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:38 -0700
Subject: KVM: selftests: allow using UFFD minor faults for demand paging

UFFD handling of MINOR faults is a new feature whose use case is to
speed up demand paging (compared to MISSING faults). So, it's
interesting to let this selftest exercise this new mode.

Modify the demand paging test to have the option of using UFFD minor
faults, as opposed to missing faults. Now, when turning on userfaultfd
with '-u', the desired mode has to be specified ("MISSING" or "MINOR").

If we're in minor mode, before registering, prefault via the *alias*.
This way, the guest will trigger minor faults, instead of missing
faults, and we can UFFDIO_CONTINUE to resolve them.

Modify the page fault handler function to use the right ioctl depending
on the mode we're running in. In MINOR mode, use UFFDIO_CONTINUE.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-10-axelrasmussen@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c | 112 ++++++++++++++++-------
 1 file changed, 79 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 8c03484a5784..fcba527c29a6 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -73,33 +73,48 @@ static void *vcpu_worker(void *data)
 	return NULL;
 }
 
-static int handle_uffd_page_request(int uffd, uint64_t addr)
+static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
 {
-	pid_t tid;
+	pid_t tid = syscall(__NR_gettid);
 	struct timespec start;
 	struct timespec ts_diff;
-	struct uffdio_copy copy;
 	int r;
 
-	tid = syscall(__NR_gettid);
+	clock_gettime(CLOCK_MONOTONIC, &start);
 
-	copy.src = (uint64_t)guest_data_prototype;
-	copy.dst = addr;
-	copy.len = demand_paging_size;
-	copy.mode = 0;
+	if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) {
+		struct uffdio_copy copy;
 
-	clock_gettime(CLOCK_MONOTONIC, &start);
+		copy.src = (uint64_t)guest_data_prototype;
+		copy.dst = addr;
+		copy.len = demand_paging_size;
+		copy.mode = 0;
+
+		r = ioctl(uffd, UFFDIO_COPY, &copy);
+		if (r == -1) {
+			pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n",
+				addr, tid, errno);
+			return r;
+		}
+	} else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) {
+		struct uffdio_continue cont = {0};
+
+		cont.range.start = addr;
+		cont.range.len = demand_paging_size;
 
-	r = ioctl(uffd, UFFDIO_COPY, &copy);
-	if (r == -1) {
-		pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n",
-			addr, tid, errno);
-		return r;
+		r = ioctl(uffd, UFFDIO_CONTINUE, &cont);
+		if (r == -1) {
+			pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n",
+				addr, tid, errno);
+			return r;
+		}
+	} else {
+		TEST_FAIL("Invalid uffd mode %d", uffd_mode);
 	}
 
 	ts_diff = timespec_elapsed(start);
 
-	PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
+	PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid,
 		       timespec_to_ns(ts_diff));
 	PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
 		       demand_paging_size, addr, tid);
@@ -110,6 +125,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
 bool quit_uffd_thread;
 
 struct uffd_handler_args {
+	int uffd_mode;
 	int uffd;
 	int pipefd;
 	useconds_t delay;
@@ -186,7 +202,7 @@ static void *uffd_handler_thread_fn(void *arg)
 		if (delay)
 			usleep(delay);
 		addr =  msg.arg.pagefault.address;
-		r = handle_uffd_page_request(uffd, addr);
+		r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr);
 		if (r < 0)
 			return NULL;
 		pages++;
@@ -202,13 +218,32 @@ static void *uffd_handler_thread_fn(void *arg)
 
 static void setup_demand_paging(struct kvm_vm *vm,
 				pthread_t *uffd_handler_thread, int pipefd,
-				useconds_t uffd_delay,
+				int uffd_mode, useconds_t uffd_delay,
 				struct uffd_handler_args *uffd_args,
-				void *hva, uint64_t len)
+				void *hva, void *alias, uint64_t len)
 {
+	bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
 	int uffd;
 	struct uffdio_api uffdio_api;
 	struct uffdio_register uffdio_register;
+	uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
+
+	PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
+		       is_minor ? "MINOR" : "MISSING",
+		       is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY");
+
+	/* In order to get minor faults, prefault via the alias. */
+	if (is_minor) {
+		size_t p;
+
+		expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
+
+		TEST_ASSERT(alias != NULL, "Alias required for minor faults");
+		for (p = 0; p < (len / demand_paging_size); ++p) {
+			memcpy(alias + (p * demand_paging_size),
+			       guest_data_prototype, demand_paging_size);
+		}
+	}
 
 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 	TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
@@ -221,12 +256,13 @@ static void setup_demand_paging(struct kvm_vm *vm,
 
 	uffdio_register.range.start = (uint64_t)hva;
 	uffdio_register.range.len = len;
-	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	uffdio_register.mode = uffd_mode;
 	TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
 		    "ioctl UFFDIO_REGISTER failed");
-	TEST_ASSERT((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) ==
-		    UFFD_API_RANGE_IOCTLS, "unexpected userfaultfd ioctl set");
+	TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
+		    expected_ioctls, "missing userfaultfd ioctls");
 
+	uffd_args->uffd_mode = uffd_mode;
 	uffd_args->uffd = uffd;
 	uffd_args->pipefd = pipefd;
 	uffd_args->delay = uffd_delay;
@@ -238,7 +274,7 @@ static void setup_demand_paging(struct kvm_vm *vm,
 }
 
 struct test_params {
-	bool use_uffd;
+	int uffd_mode;
 	useconds_t uffd_delay;
 	enum vm_mem_backing_src_type src_type;
 	bool partition_vcpu_memory_access;
@@ -275,7 +311,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
 			      p->partition_vcpu_memory_access);
 
-	if (p->use_uffd) {
+	if (p->uffd_mode) {
 		uffd_handler_threads =
 			malloc(nr_vcpus * sizeof(*uffd_handler_threads));
 		TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
@@ -289,6 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 		for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
 			vm_paddr_t vcpu_gpa;
 			void *vcpu_hva;
+			void *vcpu_alias;
 			uint64_t vcpu_mem_size;
 
 
@@ -303,8 +340,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 			PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
 				       vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
 
-			/* Cache the HVA pointer of the region */
+			/* Cache the host addresses of the region */
 			vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
+			vcpu_alias = addr_gpa2alias(vm, vcpu_gpa);
 
 			/*
 			 * Set up user fault fd to handle demand paging
@@ -315,8 +353,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 			TEST_ASSERT(!r, "Failed to set up pipefd");
 
 			setup_demand_paging(vm, &uffd_handler_threads[vcpu_id],
-					    pipefds[vcpu_id * 2], p->uffd_delay,
-					    &uffd_args[vcpu_id], vcpu_hva,
+					    pipefds[vcpu_id * 2], p->uffd_mode,
+					    p->uffd_delay, &uffd_args[vcpu_id],
+					    vcpu_hva, vcpu_alias,
 					    vcpu_mem_size);
 		}
 	}
@@ -345,7 +384,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	pr_info("All vCPU threads joined\n");
 
-	if (p->use_uffd) {
+	if (p->uffd_mode) {
 		char c;
 
 		/* Tell the user fault fd handler threads to quit */
@@ -367,7 +406,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 	free(guest_data_prototype);
 	free(vcpu_threads);
-	if (p->use_uffd) {
+	if (p->uffd_mode) {
 		free(uffd_handler_threads);
 		free(uffd_args);
 		free(pipefds);
@@ -377,11 +416,11 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 static void help(char *name)
 {
 	puts("");
-	printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
+	printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n"
 	       "          [-b memory] [-t type] [-v vcpus] [-o]\n", name);
 	guest_modes_help();
-	printf(" -u: use User Fault FD to handle vCPU page\n"
-	       "     faults.\n");
+	printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
+	       "     UFFD registration mode: 'MISSING' or 'MINOR'.\n");
 	printf(" -d: add a delay in usec to the User Fault\n"
 	       "     FD handler to simulate demand paging\n"
 	       "     overheads. Ignored without -u.\n");
@@ -408,13 +447,17 @@ int main(int argc, char *argv[])
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:ud:b:t:v:o")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
 			break;
 		case 'u':
-			p.use_uffd = true;
+			if (!strcmp("MISSING", optarg))
+				p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
+			else if (!strcmp("MINOR", optarg))
+				p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR;
+			TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'.");
 			break;
 		case 'd':
 			p.uffd_delay = strtoul(optarg, NULL, 0);
@@ -441,6 +484,9 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	TEST_ASSERT(p.uffd_mode != UFFDIO_REGISTER_MODE_MINOR || p.src_type == VM_MEM_SRC_SHMEM,
+		    "userfaultfd MINOR mode requires shared memory; pick a different -t");
+
 	for_each_guest_mode(run_test, &p);
 
 	return 0;
-- 
cgit v1.2.3


From 33090a884da5e9760f11441ac269f754375f80f5 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 19 May 2021 13:03:39 -0700
Subject: KVM: selftests: add shared hugetlbfs backing source type

This lets us run the demand paging test on top of a shared
hugetlbfs-backed area. The "shared" is key, as this allows us to
exercise userfaultfd minor faults on hugetlbfs.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Message-Id: <20210519200339.829146-11-axelrasmussen@google.com>
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/demand_paging_test.c |  6 ++++--
 tools/testing/selftests/kvm/include/test_util.h  | 11 +++++++++++
 tools/testing/selftests/kvm/lib/kvm_util.c       |  9 +++++++--
 tools/testing/selftests/kvm/lib/test_util.c      | 11 +++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index fcba527c29a6..b74704305835 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -484,8 +484,10 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	TEST_ASSERT(p.uffd_mode != UFFDIO_REGISTER_MODE_MINOR || p.src_type == VM_MEM_SRC_SHMEM,
-		    "userfaultfd MINOR mode requires shared memory; pick a different -t");
+	if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR &&
+	    !backing_src_is_shared(p.src_type)) {
+		TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t");
+	}
 
 	for_each_guest_mode(run_test, &p);
 
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 7377f00469ef..d79be15dd3d2 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -17,6 +17,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/mman.h>
 #include "kselftest.h"
 
 static inline int _no_printf(const char *format, ...) { return 0; }
@@ -85,6 +86,7 @@ enum vm_mem_backing_src_type {
 	VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
 	VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
 	VM_MEM_SRC_SHMEM,
+	VM_MEM_SRC_SHARED_HUGETLB,
 	NUM_SRC_TYPES,
 };
 
@@ -101,4 +103,13 @@ size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
+/*
+ * Whether or not the given source type is shared memory (as opposed to
+ * anonymous).
+ */
+static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t)
+{
+	return vm_mem_backing_src_alias(t)->flag & MAP_SHARED;
+}
+
 #endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index f62780719176..28e528c19d28 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -848,8 +848,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 		region->mmap_size += alignment;
 
 	region->fd = -1;
-	if (src_type == VM_MEM_SRC_SHMEM) {
-		region->fd = memfd_create("kvm_selftest", MFD_CLOEXEC);
+	if (backing_src_is_shared(src_type)) {
+		int memfd_flags = MFD_CLOEXEC;
+
+		if (src_type == VM_MEM_SRC_SHARED_HUGETLB)
+			memfd_flags |= MFD_HUGETLB;
+
+		region->fd = memfd_create("kvm_selftest", memfd_flags);
 		TEST_ASSERT(region->fd != -1,
 			    "memfd_create failed, errno: %i", errno);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index c7a265da5090..6ad6c8276b2e 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -240,6 +240,16 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
 			.name = "shmem",
 			.flag = MAP_SHARED,
 		},
+		[VM_MEM_SRC_SHARED_HUGETLB] = {
+			.name = "shared_hugetlb",
+			/*
+			 * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since
+			 * we're using "file backed" memory, we need to specify
+			 * this when the FD is created, not when the area is
+			 * mapped.
+			 */
+			.flag = MAP_SHARED,
+		},
 	};
 	_Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
 		       "Missing new backing src types?");
@@ -262,6 +272,7 @@ size_t get_backing_src_pagesz(uint32_t i)
 	case VM_MEM_SRC_ANONYMOUS_THP:
 		return get_trans_hugepagesz();
 	case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+	case VM_MEM_SRC_SHARED_HUGETLB:
 		return get_def_hugetlb_pagesz();
 	default:
 		return MAP_HUGE_PAGE_SIZE(flag);
-- 
cgit v1.2.3


From fb1070d18edb37daf3979662975bc54625a19953 Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Fri, 21 May 2021 01:58:43 -0700
Subject: KVM: X86: Use _BITUL() macro in UAPI headers

Replace BIT() in KVM's UPAI header with _BITUL(). BIT() is not defined
in the UAPI headers and its usage may cause userspace build errors.

Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Signed-off-by: Joe Richey <joerichey@google.com>
Message-Id: <20210521085849.37676-3-joerichey94@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/uapi/linux/kvm.h       | 5 +++--
 tools/include/uapi/linux/kvm.h | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3fd9a7e9d90c..79d9c44d1ad7 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -8,6 +8,7 @@
  * Note: you must update KVM_API_VERSION if you change this interface.
  */
 
+#include <linux/const.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
@@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd {
  * conversion after harvesting an entry.  Also, it must not skip any
  * dirty bits, so that dirty bits are always harvested in sequence.
  */
-#define KVM_DIRTY_GFN_F_DIRTY           BIT(0)
-#define KVM_DIRTY_GFN_F_RESET           BIT(1)
+#define KVM_DIRTY_GFN_F_DIRTY           _BITUL(0)
+#define KVM_DIRTY_GFN_F_RESET           _BITUL(1)
 #define KVM_DIRTY_GFN_F_MASK            0x3
 
 /*
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index f6afee209620..26e6d94d64ed 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -8,6 +8,7 @@
  * Note: you must update KVM_API_VERSION if you change this interface.
  */
 
+#include <linux/const.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
@@ -1834,8 +1835,8 @@ struct kvm_hyperv_eventfd {
  * conversion after harvesting an entry.  Also, it must not skip any
  * dirty bits, so that dirty bits are always harvested in sequence.
  */
-#define KVM_DIRTY_GFN_F_DIRTY           BIT(0)
-#define KVM_DIRTY_GFN_F_RESET           BIT(1)
+#define KVM_DIRTY_GFN_F_DIRTY           _BITUL(0)
+#define KVM_DIRTY_GFN_F_RESET           _BITUL(1)
 #define KVM_DIRTY_GFN_F_MASK            0x3
 
 /*
-- 
cgit v1.2.3


From fb0f94794bb7558c078ce37b1a6e30d881fd7888 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 26 May 2021 14:36:14 -0400
Subject: selftests: kvm: do only 1 memslot_perf_test run by default

The test takes a long time with the current implementation of
memslots, so cut the run time a bit.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/memslot_perf_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 4ae0e5ec0f74..11239652d805 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -992,7 +992,7 @@ int main(int argc, char *argv[])
 		.tlast = NTESTS - 1,
 		.nslots = -1,
 		.seconds = 5,
-		.runs = 20,
+		.runs = 1,
 	};
 	struct test_result rbestslottime;
 	int tctr;
-- 
cgit v1.2.3


From c59870e2110e1229a6e4b2457aece6ffe8d68d99 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 19 May 2021 09:44:47 -0700
Subject: perf debug: Move debug initialization earlier

This avoids segfaults during option handlers that use pr_err. For
example, "perf --debug nopager list" segfaults before this change.

Fixes: 8abceacff87d (perf debug: Add debug_set_file function)
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20210519164447.2672030-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/perf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 20cb91ef06ff..2f6b67189b42 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -443,6 +443,8 @@ int main(int argc, const char **argv)
 	const char *cmd;
 	char sbuf[STRERR_BUFSIZE];
 
+	perf_debug_setup();
+
 	/* libsubcmd init */
 	exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
 	pager_init(PERF_PAGER_ENVIRONMENT);
@@ -531,8 +533,6 @@ int main(int argc, const char **argv)
 	 */
 	pthread__block_sigwinch();
 
-	perf_debug_setup();
-
 	while (1) {
 		static int done_help;
 
-- 
cgit v1.2.3


From c673b7f59e940061467200f1746820a178444bd0 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 27 May 2021 15:00:52 -0700
Subject: perf stat: Fix error check for bpf_program__attach

It seems the bpf_program__attach() returns a negative error code instead
of a NULL pointer in case of error.

Fixes: 7fac83aaf2ee ("perf stat: Introduce 'bperf' to share hardware PMCs with BPF")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Link: http://lore.kernel.org/lkml/20210527220052.1657578-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index ddb52f748c8e..974f10e356f0 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -451,10 +451,10 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
 		goto out;
 	}
 
-	err = -1;
 	link = bpf_program__attach(skel->progs.on_switch);
-	if (!link) {
+	if (IS_ERR(link)) {
 		pr_err("Failed to attach leader program\n");
+		err = PTR_ERR(link);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 8fc4e4aa2bfca8d32e8bc2a01526ea2da450e6cb Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Tue, 25 May 2021 12:07:23 +0530
Subject: perf vendor events powerpc: Fix eventcode of power10 JSON events

Fixed the eventcode values in the power10 JSON event files to prepend
"0x" since these are hexadecimal values.

The patch also changes the event description of the PM_EXEC_STALL_LOAD_FINISH
and PM_EXEC_STALL_NTC_FLUSH event and move some events to correct files.

Fixes: 32daa5d7899e ("perf vendor events: Initial JSON/events list for power10 platform")
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-by: Paul A. Clarke <pc@us.ibm.com>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Cc: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lore.kernel.org/lkml/20210525063723.1191514-1-kjain@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../pmu-events/arch/powerpc/power10/cache.json     |  30 +++--
 .../arch/powerpc/power10/floating_point.json       |   2 +-
 .../pmu-events/arch/powerpc/power10/frontend.json  | 124 ++++++++++++-------
 .../pmu-events/arch/powerpc/power10/locks.json     |   4 +-
 .../pmu-events/arch/powerpc/power10/marked.json    |  61 +++++-----
 .../pmu-events/arch/powerpc/power10/memory.json    |  79 ++++++------
 .../pmu-events/arch/powerpc/power10/others.json    | 133 +++++++++-----------
 .../pmu-events/arch/powerpc/power10/pipeline.json  | 135 ++++++++++-----------
 .../perf/pmu-events/arch/powerpc/power10/pmc.json  |   8 +-
 .../arch/powerpc/power10/translation.json          |  22 ++--
 10 files changed, 299 insertions(+), 299 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json
index 616f29098c71..605be14f441c 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json
@@ -1,46 +1,56 @@
 [
   {
-    "EventCode": "1003C",
+    "EventCode": "0x1003C",
     "EventName": "PM_EXEC_STALL_DMISS_L2L3",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from either the local L2 or local L3."
   },
   {
-    "EventCode": "34056",
+    "EventCode": "0x1E054",
+    "EventName": "PM_EXEC_STALL_DMISS_L21_L31",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip."
+  },
+  {
+    "EventCode": "0x34054",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict."
+  },
+  {
+    "EventCode": "0x34056",
     "EventName": "PM_EXEC_STALL_LOAD_FINISH",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ."
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ; cycles in which the NTF instruction is waiting for a data reload for a load miss, but the data comes back with a non-NTF instruction."
   },
   {
-    "EventCode": "3006C",
+    "EventCode": "0x3006C",
     "EventName": "PM_RUN_CYC_SMT2_MODE",
     "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT2 mode."
   },
   {
-    "EventCode": "300F4",
+    "EventCode": "0x300F4",
     "EventName": "PM_RUN_INST_CMPL_CONC",
     "BriefDescription": "PowerPC instructions completed by this thread when all threads in the core had the run-latch set."
   },
   {
-    "EventCode": "4C016",
+    "EventCode": "0x4C016",
     "EventName": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, with a dispatch conflict."
   },
   {
-    "EventCode": "4D014",
+    "EventCode": "0x4D014",
     "EventName": "PM_EXEC_STALL_LOAD",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a load instruction executing in the Load Store Unit."
   },
   {
-    "EventCode": "4D016",
+    "EventCode": "0x4D016",
     "EventName": "PM_EXEC_STALL_PTESYNC",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a PTESYNC instruction executing in the Load Store Unit."
   },
   {
-    "EventCode": "401EA",
+    "EventCode": "0x401EA",
     "EventName": "PM_THRESH_EXC_128",
     "BriefDescription": "Threshold counter exceeded a value of 128."
   },
   {
-    "EventCode": "400F6",
+    "EventCode": "0x400F6",
     "EventName": "PM_BR_MPRED_CMPL",
     "BriefDescription": "A mispredicted branch completed. Includes direction and target."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
index 703cd431ae5b..54acb55e2c8c 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
@@ -1,6 +1,6 @@
 [
   {
-    "EventCode": "4016E",
+    "EventCode": "0x4016E",
     "EventName": "PM_THRESH_NOT_MET",
     "BriefDescription": "Threshold counter did not meet threshold."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
index eac8609dcc90..558f9530f54e 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
@@ -1,216 +1,246 @@
 [
   {
-    "EventCode": "10004",
+    "EventCode": "0x10004",
     "EventName": "PM_EXEC_STALL_TRANSLATION",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss or ERAT miss and waited for it to resolve."
   },
   {
-    "EventCode": "10010",
+    "EventCode": "0x10006",
+    "EventName": "PM_DISP_STALL_HELD_OTHER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason."
+  },
+  {
+    "EventCode": "0x10010",
     "EventName": "PM_PMC4_OVERFLOW",
     "BriefDescription": "The event selected for PMC4 caused the event counter to overflow."
   },
   {
-    "EventCode": "10020",
+    "EventCode": "0x10020",
     "EventName": "PM_PMC4_REWIND",
     "BriefDescription": "The speculative event selected for PMC4 rewinds and the counter for PMC4 is not charged."
   },
   {
-    "EventCode": "10038",
+    "EventCode": "0x10038",
     "EventName": "PM_DISP_STALL_TRANSLATION",
     "BriefDescription": "Cycles when dispatch was stalled for this thread because the MMU was handling a translation miss."
   },
   {
-    "EventCode": "1003A",
+    "EventCode": "0x1003A",
     "EventName": "PM_DISP_STALL_BR_MPRED_IC_L2",
     "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2 after suffering a branch mispredict."
   },
   {
-    "EventCode": "1E050",
+    "EventCode": "0x1D05E",
+    "EventName": "PM_DISP_STALL_HELD_HALT_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management."
+  },
+  {
+    "EventCode": "0x1E050",
     "EventName": "PM_DISP_STALL_HELD_STF_MAPPER_CYC",
     "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the STF mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR."
   },
   {
-    "EventCode": "1F054",
+    "EventCode": "0x1F054",
     "EventName": "PM_DTLB_HIT",
     "BriefDescription": "The PTE required by the instruction was resident in the TLB (data TLB access). When MMCR1[16]=0 this event counts only demand hits. When MMCR1[16]=1 this event includes demand and prefetch. Applies to both HPT and RPT."
   },
   {
-    "EventCode": "101E8",
+    "EventCode": "0x10064",
+    "EventName": "PM_DISP_STALL_IC_L2",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2."
+  },
+  {
+    "EventCode": "0x101E8",
     "EventName": "PM_THRESH_EXC_256",
     "BriefDescription": "Threshold counter exceeded a count of 256."
   },
   {
-    "EventCode": "101EC",
+    "EventCode": "0x101EC",
     "EventName": "PM_THRESH_MET",
     "BriefDescription": "Threshold exceeded."
   },
   {
-    "EventCode": "100F2",
+    "EventCode": "0x100F2",
     "EventName": "PM_1PLUS_PPC_CMPL",
     "BriefDescription": "Cycles in which at least one instruction is completed by this thread."
   },
   {
-    "EventCode": "100F6",
+    "EventCode": "0x100F6",
     "EventName": "PM_IERAT_MISS",
     "BriefDescription": "IERAT Reloaded to satisfy an IERAT miss. All page sizes are counted by this event."
   },
   {
-    "EventCode": "100F8",
+    "EventCode": "0x100F8",
     "EventName": "PM_DISP_STALL_CYC",
     "BriefDescription": "Cycles the ICT has no itags assigned to this thread (no instructions were dispatched during these cycles)."
   },
   {
-    "EventCode": "20114",
+    "EventCode": "0x20006",
+    "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue."
+  },
+  {
+    "EventCode": "0x20114",
     "EventName": "PM_MRK_L2_RC_DISP",
     "BriefDescription": "Marked instruction RC dispatched in L2."
   },
   {
-    "EventCode": "2C010",
+    "EventCode": "0x2C010",
     "EventName": "PM_EXEC_STALL_LSU",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Load Store Unit. This does not include simple fixed point instructions."
   },
   {
-    "EventCode": "2C016",
+    "EventCode": "0x2C016",
     "EventName": "PM_DISP_STALL_IERAT_ONLY_MISS",
     "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction ERAT miss."
   },
   {
-    "EventCode": "2C01E",
+    "EventCode": "0x2C01E",
     "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3",
     "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3 after suffering a branch mispredict."
   },
   {
-    "EventCode": "2D01A",
+    "EventCode": "0x2D01A",
     "EventName": "PM_DISP_STALL_IC_MISS",
     "BriefDescription": "Cycles when dispatch was stalled for this thread due to an Icache Miss."
   },
   {
-    "EventCode": "2D01C",
-    "EventName": "PM_CMPL_STALL_STCX",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing."
-  },
-  {
-    "EventCode": "2E018",
+    "EventCode": "0x2E018",
     "EventName": "PM_DISP_STALL_FETCH",
     "BriefDescription": "Cycles when dispatch was stalled for this thread because Fetch was being held."
   },
   {
-    "EventCode": "2E01A",
+    "EventCode": "0x2E01A",
     "EventName": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC",
     "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the XVFC mapper/SRB was full."
   },
   {
-    "EventCode": "2C142",
+    "EventCode": "0x2C142",
     "EventName": "PM_MRK_XFER_FROM_SRC_PMC2",
     "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "24050",
+    "EventCode": "0x24050",
     "EventName": "PM_IOPS_DISP",
     "BriefDescription": "Internal Operations dispatched. PM_IOPS_DISP / PM_INST_DISP will show the average number of internal operations per PowerPC instruction."
   },
   {
-    "EventCode": "2405E",
+    "EventCode": "0x2405E",
     "EventName": "PM_ISSUE_CANCEL",
     "BriefDescription": "An instruction issued and the issue was later cancelled. Only one cancel per PowerPC instruction."
   },
   {
-    "EventCode": "200FA",
+    "EventCode": "0x200FA",
     "EventName": "PM_BR_TAKEN_CMPL",
     "BriefDescription": "Branch Taken instruction completed."
   },
   {
-    "EventCode": "30012",
+    "EventCode": "0x30004",
+    "EventName": "PM_DISP_STALL_FLUSH",
+    "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC."
+  },
+  {
+    "EventCode": "0x3000A",
+    "EventName": "PM_DISP_STALL_ITLB_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss."
+  },
+  {
+    "EventCode": "0x30012",
     "EventName": "PM_FLUSH_COMPLETION",
     "BriefDescription": "The instruction that was next to complete (oldest in the pipeline) did not complete because it suffered a flush."
   },
   {
-    "EventCode": "30014",
+    "EventCode": "0x30014",
     "EventName": "PM_EXEC_STALL_STORE",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store instruction executing in the Load Store Unit."
   },
   {
-    "EventCode": "30018",
+    "EventCode": "0x30018",
     "EventName": "PM_DISP_STALL_HELD_SCOREBOARD_CYC",
     "BriefDescription": "Cycles in which the NTC instruction is held at dispatch while waiting on the Scoreboard. This event combines VSCR and FPSCR together."
   },
   {
-    "EventCode": "30026",
+    "EventCode": "0x30026",
     "EventName": "PM_EXEC_STALL_STORE_MISS",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store whose cache line was not resident in the L1 and was waiting for allocation of the missing line into the L1."
   },
   {
-    "EventCode": "3012A",
+    "EventCode": "0x3012A",
     "EventName": "PM_MRK_L2_RC_DONE",
     "BriefDescription": "L2 RC machine completed the transaction for the marked instruction."
   },
   {
-    "EventCode": "3F046",
+    "EventCode": "0x3F046",
     "EventName": "PM_ITLB_HIT_1G",
     "BriefDescription": "Instruction TLB hit (IERAT reload) page size 1G, which implies Radix Page Table translation is in use. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "34058",
+    "EventCode": "0x34058",
     "EventName": "PM_DISP_STALL_BR_MPRED_ICMISS",
     "BriefDescription": "Cycles when dispatch was stalled after a mispredicted branch resulted in an instruction cache miss."
   },
   {
-    "EventCode": "3D05C",
+    "EventCode": "0x3D05C",
     "EventName": "PM_DISP_STALL_HELD_RENAME_CYC",
     "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR and XVFC."
   },
   {
-    "EventCode": "3E052",
+    "EventCode": "0x3E052",
     "EventName": "PM_DISP_STALL_IC_L3",
     "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3."
   },
   {
-    "EventCode": "3E054",
+    "EventCode": "0x3E054",
     "EventName": "PM_LD_MISS_L1",
     "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
   },
   {
-    "EventCode": "301EA",
+    "EventCode": "0x301EA",
     "EventName": "PM_THRESH_EXC_1024",
     "BriefDescription": "Threshold counter exceeded a value of 1024."
   },
   {
-    "EventCode": "300FA",
+    "EventCode": "0x300FA",
     "EventName": "PM_INST_FROM_L3MISS",
     "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
   },
   {
-    "EventCode": "40006",
+    "EventCode": "0x40006",
     "EventName": "PM_ISSUE_KILL",
     "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group."
   },
   {
-    "EventCode": "40116",
+    "EventCode": "0x40116",
     "EventName": "PM_MRK_LARX_FIN",
     "BriefDescription": "Marked load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "4C010",
+    "EventCode": "0x4C010",
     "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3MISS",
     "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from sources beyond the local L3 after suffering a mispredicted branch."
   },
   {
-    "EventCode": "4D01E",
+    "EventCode": "0x4D01E",
     "EventName": "PM_DISP_STALL_BR_MPRED",
     "BriefDescription": "Cycles when dispatch was stalled for this thread due to a mispredicted branch."
   },
   {
-    "EventCode": "4E010",
+    "EventCode": "0x4E010",
     "EventName": "PM_DISP_STALL_IC_L3MISS",
     "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from any source beyond the local L3."
   },
   {
-    "EventCode": "4E01A",
+    "EventCode": "0x4E01A",
     "EventName": "PM_DISP_STALL_HELD_CYC",
     "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason."
   },
   {
-    "EventCode": "44056",
+    "EventCode": "0x4003C",
+    "EventName": "PM_DISP_STALL_HELD_SYNC_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch."
+  },
+  {
+    "EventCode": "0x44056",
     "EventName": "PM_VECTOR_ST_CMPL",
     "BriefDescription": "Vector store instructions completed."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json
index 016d8de0e14a..b5a0d6521963 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/locks.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json
@@ -1,11 +1,11 @@
 [
   {
-    "EventCode": "1E058",
+    "EventCode": "0x1E058",
     "EventName": "PM_STCX_FAIL_FIN",
     "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "4E050",
+    "EventCode": "0x4E050",
     "EventName": "PM_STCX_PASS_FIN",
     "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/marked.json b/tools/perf/pmu-events/arch/powerpc/power10/marked.json
index 93a5a5910648..58b5dfe3a273 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/marked.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/marked.json
@@ -1,146 +1,141 @@
 [
   {
-    "EventCode": "1002C",
+    "EventCode": "0x1002C",
     "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS",
     "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request."
   },
   {
-    "EventCode": "10132",
+    "EventCode": "0x10132",
     "EventName": "PM_MRK_INST_ISSUED",
     "BriefDescription": "Marked instruction issued. Note that stores always get issued twice, the address gets issued to the LSU and the data gets issued to the VSU. Also, issues can sometimes get killed/cancelled and cause multiple sequential issues for the same instruction."
   },
   {
-    "EventCode": "101E0",
+    "EventCode": "0x101E0",
     "EventName": "PM_MRK_INST_DISP",
     "BriefDescription": "The thread has dispatched a randomly sampled marked instruction."
   },
   {
-    "EventCode": "101E2",
+    "EventCode": "0x101E2",
     "EventName": "PM_MRK_BR_TAKEN_CMPL",
     "BriefDescription": "Marked Branch Taken instruction completed."
   },
   {
-    "EventCode": "20112",
+    "EventCode": "0x20112",
     "EventName": "PM_MRK_NTF_FIN",
     "BriefDescription": "The marked instruction became the oldest in the pipeline before it finished. It excludes instructions that finish at dispatch."
   },
   {
-    "EventCode": "2C01C",
+    "EventCode": "0x2C01C",
     "EventName": "PM_EXEC_STALL_DMISS_OFF_CHIP",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a remote chip."
   },
   {
-    "EventCode": "20138",
+    "EventCode": "0x20138",
     "EventName": "PM_MRK_ST_NEST",
     "BriefDescription": "A store has been sampled/marked and is at the point of execution where it has completed in the core and can no longer be flushed. At this point the store is sent to the L2."
   },
   {
-    "EventCode": "2013A",
+    "EventCode": "0x2013A",
     "EventName": "PM_MRK_BRU_FIN",
     "BriefDescription": "Marked Branch instruction finished."
   },
   {
-    "EventCode": "2C144",
+    "EventCode": "0x2C144",
     "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC2",
     "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[15:27]."
   },
   {
-    "EventCode": "24156",
+    "EventCode": "0x24156",
     "EventName": "PM_MRK_STCX_FIN",
     "BriefDescription": "Marked conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "24158",
+    "EventCode": "0x24158",
     "EventName": "PM_MRK_INST",
     "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens."
   },
   {
-    "EventCode": "2415C",
+    "EventCode": "0x2415C",
     "EventName": "PM_MRK_BR_CMPL",
     "BriefDescription": "A marked branch completed. All branches are included."
   },
   {
-    "EventCode": "200FD",
+    "EventCode": "0x200FD",
     "EventName": "PM_L1_ICACHE_MISS",
     "BriefDescription": "Demand iCache Miss."
   },
   {
-    "EventCode": "30130",
+    "EventCode": "0x30130",
     "EventName": "PM_MRK_INST_FIN",
     "BriefDescription": "marked instruction finished. Excludes instructions that finish at dispatch. Note that stores always finish twice since the address gets issued to the LSU and the data gets issued to the VSU."
   },
   {
-    "EventCode": "34146",
+    "EventCode": "0x34146",
     "EventName": "PM_MRK_LD_CMPL",
     "BriefDescription": "Marked loads completed."
   },
   {
-    "EventCode": "3E158",
+    "EventCode": "0x3E158",
     "EventName": "PM_MRK_STCX_FAIL",
     "BriefDescription": "Marked conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "3E15A",
+    "EventCode": "0x3E15A",
     "EventName": "PM_MRK_ST_FIN",
     "BriefDescription": "The marked instruction was a store of any kind."
   },
   {
-    "EventCode": "30068",
+    "EventCode": "0x30068",
     "EventName": "PM_L1_ICACHE_RELOADED_PREF",
     "BriefDescription": "Counts all Icache prefetch reloads ( includes demand turned into prefetch)."
   },
   {
-    "EventCode": "301E4",
+    "EventCode": "0x301E4",
     "EventName": "PM_MRK_BR_MPRED_CMPL",
     "BriefDescription": "Marked Branch Mispredicted. Includes direction and target."
   },
   {
-    "EventCode": "300F6",
+    "EventCode": "0x300F6",
     "EventName": "PM_LD_DEMAND_MISS_L1",
     "BriefDescription": "The L1 cache was reloaded with a line that fulfills a demand miss request. Counted at reload time, before finish."
   },
   {
-    "EventCode": "300FE",
+    "EventCode": "0x300FE",
     "EventName": "PM_DATA_FROM_L3MISS",
     "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
   },
   {
-    "EventCode": "40012",
+    "EventCode": "0x40012",
     "EventName": "PM_L1_ICACHE_RELOADED_ALL",
     "BriefDescription": "Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch."
   },
   {
-    "EventCode": "40134",
+    "EventCode": "0x40134",
     "EventName": "PM_MRK_INST_TIMEO",
     "BriefDescription": "Marked instruction finish timeout (instruction was lost)."
   },
   {
-    "EventCode": "4003C",
-    "EventName": "PM_DISP_STALL_HELD_SYNC_CYC",
-    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch."
-  },
-  {
-    "EventCode": "4505A",
+    "EventCode": "0x4505A",
     "EventName": "PM_SP_FLOP_CMPL",
     "BriefDescription": "Single Precision floating point instructions completed."
   },
   {
-    "EventCode": "4D058",
+    "EventCode": "0x4D058",
     "EventName": "PM_VECTOR_FLOP_CMPL",
     "BriefDescription": "Vector floating point instructions completed."
   },
   {
-    "EventCode": "4D05A",
+    "EventCode": "0x4D05A",
     "EventName": "PM_NON_MATH_FLOP_CMPL",
     "BriefDescription": "Non Math instructions completed."
   },
   {
-    "EventCode": "401E0",
+    "EventCode": "0x401E0",
     "EventName": "PM_MRK_INST_CMPL",
     "BriefDescription": "marked instruction completed."
   },
   {
-    "EventCode": "400FE",
+    "EventCode": "0x400FE",
     "EventName": "PM_DATA_FROM_MEMORY",
     "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json
index b01141eeebee..843b51f531e9 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/memory.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json
@@ -1,191 +1,186 @@
 [
   {
-    "EventCode": "1000A",
+    "EventCode": "0x1000A",
     "EventName": "PM_PMC3_REWIND",
     "BriefDescription": "The speculative event selected for PMC3 rewinds and the counter for PMC3 is not charged."
   },
   {
-    "EventCode": "1C040",
+    "EventCode": "0x1C040",
     "EventName": "PM_XFER_FROM_SRC_PMC1",
     "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "1C142",
+    "EventCode": "0x1C142",
     "EventName": "PM_MRK_XFER_FROM_SRC_PMC1",
     "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "1C144",
+    "EventCode": "0x1C144",
     "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC1",
     "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[0:12]."
   },
   {
-    "EventCode": "1C056",
+    "EventCode": "0x1C056",
     "EventName": "PM_DERAT_MISS_4K",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "1C058",
+    "EventCode": "0x1C058",
     "EventName": "PM_DTLB_MISS_16G",
     "BriefDescription": "Data TLB reload (after a miss) page size 16G. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "1C05C",
+    "EventCode": "0x1C05C",
     "EventName": "PM_DTLB_MISS_2M",
     "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "1E056",
+    "EventCode": "0x1E056",
     "EventName": "PM_EXEC_STALL_STORE_PIPE",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the store unit. This does not include cycles spent handling store misses, PTESYNC instructions or TLBIE instructions."
   },
   {
-    "EventCode": "1F150",
+    "EventCode": "0x1F150",
     "EventName": "PM_MRK_ST_L2_CYC",
     "BriefDescription": "Cycles from L2 RC dispatch to L2 RC completion."
   },
   {
-    "EventCode": "10062",
+    "EventCode": "0x10062",
     "EventName": "PM_LD_L3MISS_PEND_CYC",
     "BriefDescription": "Cycles L3 miss was pending for this thread."
   },
   {
-    "EventCode": "20010",
+    "EventCode": "0x20010",
     "EventName": "PM_PMC1_OVERFLOW",
     "BriefDescription": "The event selected for PMC1 caused the event counter to overflow."
   },
   {
-    "EventCode": "2001A",
+    "EventCode": "0x2001A",
     "EventName": "PM_ITLB_HIT",
     "BriefDescription": "The PTE required to translate the instruction address was resident in the TLB (instruction TLB access/IERAT reload). Applies to both HPT and RPT. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "2003E",
+    "EventCode": "0x2003E",
     "EventName": "PM_PTESYNC_FIN",
     "BriefDescription": "Ptesync instruction finished in the store unit. Only one ptesync can finish at a time."
   },
   {
-    "EventCode": "2C040",
+    "EventCode": "0x2C040",
     "EventName": "PM_XFER_FROM_SRC_PMC2",
     "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "2C054",
+    "EventCode": "0x2C054",
     "EventName": "PM_DERAT_MISS_64K",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "2C056",
+    "EventCode": "0x2C056",
     "EventName": "PM_DTLB_MISS_4K",
     "BriefDescription": "Data TLB reload (after a miss) page size 4K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "2D154",
+    "EventCode": "0x2D154",
     "EventName": "PM_MRK_DERAT_MISS_64K",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "200F6",
+    "EventCode": "0x200F6",
     "EventName": "PM_DERAT_MISS",
     "BriefDescription": "DERAT Reloaded to satisfy a DERAT miss. All page sizes are counted by this event. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "3000A",
-    "EventName": "PM_DISP_STALL_ITLB_MISS",
-    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss."
-  },
-  {
-    "EventCode": "30016",
+    "EventCode": "0x30016",
     "EventName": "PM_EXEC_STALL_DERAT_DTLB_MISS",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss and waited for it resolve."
   },
   {
-    "EventCode": "3C040",
+    "EventCode": "0x3C040",
     "EventName": "PM_XFER_FROM_SRC_PMC3",
     "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "3C142",
+    "EventCode": "0x3C142",
     "EventName": "PM_MRK_XFER_FROM_SRC_PMC3",
     "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "3C144",
+    "EventCode": "0x3C144",
     "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC3",
     "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[30:42]."
   },
   {
-    "EventCode": "3C054",
+    "EventCode": "0x3C054",
     "EventName": "PM_DERAT_MISS_16M",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "3C056",
+    "EventCode": "0x3C056",
     "EventName": "PM_DTLB_MISS_64K",
     "BriefDescription": "Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "3C058",
+    "EventCode": "0x3C058",
     "EventName": "PM_LARX_FIN",
     "BriefDescription": "Load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "301E2",
+    "EventCode": "0x301E2",
     "EventName": "PM_MRK_ST_CMPL",
     "BriefDescription": "Marked store completed and sent to nest. Note that this count excludes cache-inhibited stores."
   },
   {
-    "EventCode": "300FC",
+    "EventCode": "0x300FC",
     "EventName": "PM_DTLB_MISS",
     "BriefDescription": "The DPTEG required for the load/store instruction in execution was missing from the TLB. It includes pages of all sizes for demand and prefetch activity."
   },
   {
-    "EventCode": "4D02C",
+    "EventCode": "0x4D02C",
     "EventName": "PM_PMC1_REWIND",
     "BriefDescription": "The speculative event selected for PMC1 rewinds and the counter for PMC1 is not charged."
   },
   {
-    "EventCode": "4003E",
+    "EventCode": "0x4003E",
     "EventName": "PM_LD_CMPL",
     "BriefDescription": "Loads completed."
   },
   {
-    "EventCode": "4C040",
+    "EventCode": "0x4C040",
     "EventName": "PM_XFER_FROM_SRC_PMC4",
     "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "4C142",
+    "EventCode": "0x4C142",
     "EventName": "PM_MRK_XFER_FROM_SRC_PMC4",
     "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
   },
   {
-    "EventCode": "4C144",
+    "EventCode": "0x4C144",
     "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC4",
     "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[45:57]."
   },
   {
-    "EventCode": "4C056",
+    "EventCode": "0x4C056",
     "EventName": "PM_DTLB_MISS_16M",
     "BriefDescription": "Data TLB reload (after a miss) page size 16M. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "4C05A",
+    "EventCode": "0x4C05A",
     "EventName": "PM_DTLB_MISS_1G",
     "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "4C15E",
+    "EventCode": "0x4C15E",
     "EventName": "PM_MRK_DTLB_MISS_64K",
     "BriefDescription": "Marked Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "4D056",
+    "EventCode": "0x4D056",
     "EventName": "PM_NON_FMA_FLOP_CMPL",
     "BriefDescription": "Non FMA instruction completed."
   },
   {
-    "EventCode": "40164",
+    "EventCode": "0x40164",
     "EventName": "PM_MRK_DERAT_MISS_2M",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json
index a119e56cbf1c..7d0de1a2860b 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/others.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json
@@ -1,296 +1,271 @@
 [
   {
-    "EventCode": "10016",
+    "EventCode": "0x10016",
     "EventName": "PM_VSU0_ISSUE",
     "BriefDescription": "VSU instructions issued to VSU pipe 0."
   },
   {
-    "EventCode": "1001C",
+    "EventCode": "0x1001C",
     "EventName": "PM_ULTRAVISOR_INST_CMPL",
     "BriefDescription": "PowerPC instructions that completed while the thread was in ultravisor state."
   },
   {
-    "EventCode": "100F0",
+    "EventCode": "0x100F0",
     "EventName": "PM_CYC",
     "BriefDescription": "Processor cycles."
   },
   {
-    "EventCode": "10134",
+    "EventCode": "0x10134",
     "EventName": "PM_MRK_ST_DONE_L2",
     "BriefDescription": "Marked stores completed in L2 (RC machine done)."
   },
   {
-    "EventCode": "1505E",
+    "EventCode": "0x1505E",
     "EventName": "PM_LD_HIT_L1",
     "BriefDescription": "Loads that finished without experiencing an L1 miss."
   },
   {
-    "EventCode": "1D05E",
-    "EventName": "PM_DISP_STALL_HELD_HALT_CYC",
-    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management."
-  },
-  {
-    "EventCode": "1E054",
-    "EventName": "PM_EXEC_STALL_DMISS_L21_L31",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip."
-  },
-  {
-    "EventCode": "1E05A",
-    "EventName": "PM_CMPL_STALL_LWSYNC",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete."
-  },
-  {
-    "EventCode": "1F056",
+    "EventCode": "0x1F056",
     "EventName": "PM_DISP_SS0_2_INSTR_CYC",
     "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions."
   },
   {
-    "EventCode": "1F15C",
+    "EventCode": "0x1F15C",
     "EventName": "PM_MRK_STCX_L2_CYC",
     "BriefDescription": "Cycles spent in the nest portion of a marked Stcx instruction. It starts counting when the operation starts to drain to the L2 and it stops counting when the instruction retires from the Instruction Completion Table (ICT) in the Instruction Sequencing Unit (ISU)."
   },
   {
-    "EventCode": "10066",
+    "EventCode": "0x10066",
     "EventName": "PM_ADJUNCT_CYC",
     "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011."
   },
   {
-    "EventCode": "101E4",
+    "EventCode": "0x101E4",
     "EventName": "PM_MRK_L1_ICACHE_MISS",
     "BriefDescription": "Marked Instruction suffered an icache Miss."
   },
   {
-    "EventCode": "101EA",
+    "EventCode": "0x101EA",
     "EventName": "PM_MRK_L1_RELOAD_VALID",
     "BriefDescription": "Marked demand reload."
   },
   {
-    "EventCode": "100F4",
+    "EventCode": "0x100F4",
     "EventName": "PM_FLOP_CMPL",
     "BriefDescription": "Floating Point Operations Completed. Includes any type. It counts once for each 1, 2, 4 or 8 flop instruction. Use PM_1|2|4|8_FLOP_CMPL events to count flops."
   },
   {
-    "EventCode": "100FA",
+    "EventCode": "0x100FA",
     "EventName": "PM_RUN_LATCH_ANY_THREAD_CYC",
     "BriefDescription": "Cycles when at least one thread has the run latch set."
   },
   {
-    "EventCode": "100FC",
+    "EventCode": "0x100FC",
     "EventName": "PM_LD_REF_L1",
     "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included."
   },
   {
-    "EventCode": "20006",
-    "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC",
-    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue."
-  },
-  {
-    "EventCode": "2000C",
+    "EventCode": "0x2000C",
     "EventName": "PM_RUN_LATCH_ALL_THREADS_CYC",
     "BriefDescription": "Cycles when the run latch is set for all threads."
   },
   {
-    "EventCode": "2E010",
+    "EventCode": "0x2E010",
     "EventName": "PM_ADJUNCT_INST_CMPL",
     "BriefDescription": "PowerPC instructions that completed while the thread is in Adjunct state."
   },
   {
-    "EventCode": "2E014",
+    "EventCode": "0x2E014",
     "EventName": "PM_STCX_FIN",
     "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
   },
   {
-    "EventCode": "20130",
+    "EventCode": "0x20130",
     "EventName": "PM_MRK_INST_DECODED",
     "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only."
   },
   {
-    "EventCode": "20132",
+    "EventCode": "0x20132",
     "EventName": "PM_MRK_DFU_ISSUE",
     "BriefDescription": "The marked instruction was a decimal floating point operation issued to the VSU. Measured at issue time."
   },
   {
-    "EventCode": "20134",
+    "EventCode": "0x20134",
     "EventName": "PM_MRK_FXU_ISSUE",
     "BriefDescription": "The marked instruction was a fixed point operation issued to the VSU. Measured at issue time."
   },
   {
-    "EventCode": "2505C",
+    "EventCode": "0x2505C",
     "EventName": "PM_VSU_ISSUE",
     "BriefDescription": "At least one VSU instruction was issued to one of the VSU pipes. Up to 4 per cycle. Includes fixed point operations."
   },
   {
-    "EventCode": "2F054",
+    "EventCode": "0x2F054",
     "EventName": "PM_DISP_SS1_2_INSTR_CYC",
     "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions."
   },
   {
-    "EventCode": "2F056",
+    "EventCode": "0x2F056",
     "EventName": "PM_DISP_SS1_4_INSTR_CYC",
     "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions."
   },
   {
-    "EventCode": "2006C",
+    "EventCode": "0x2006C",
     "EventName": "PM_RUN_CYC_SMT4_MODE",
     "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT4 mode."
   },
   {
-    "EventCode": "201E0",
+    "EventCode": "0x201E0",
     "EventName": "PM_MRK_DATA_FROM_MEMORY",
     "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss for a marked load."
   },
   {
-    "EventCode": "201E4",
+    "EventCode": "0x201E4",
     "EventName": "PM_MRK_DATA_FROM_L3MISS",
     "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked load."
   },
   {
-    "EventCode": "201E8",
+    "EventCode": "0x201E8",
     "EventName": "PM_THRESH_EXC_512",
     "BriefDescription": "Threshold counter exceeded a value of 512."
   },
   {
-    "EventCode": "200F2",
+    "EventCode": "0x200F2",
     "EventName": "PM_INST_DISP",
     "BriefDescription": "PowerPC instructions dispatched."
   },
   {
-    "EventCode": "30132",
+    "EventCode": "0x30132",
     "EventName": "PM_MRK_VSU_FIN",
     "BriefDescription": "VSU marked instructions finished. Excludes simple FX instructions issued to the Store Unit."
   },
   {
-    "EventCode": "30038",
+    "EventCode": "0x30038",
     "EventName": "PM_EXEC_STALL_DMISS_LMEM",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local memory, local OpenCapp cache, or local OpenCapp memory."
   },
   {
-    "EventCode": "3F04A",
+    "EventCode": "0x3F04A",
     "EventName": "PM_LSU_ST5_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST2 port."
   },
   {
-    "EventCode": "34054",
-    "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict."
-  },
-  {
-    "EventCode": "3405A",
+    "EventCode": "0x3405A",
     "EventName": "PM_PRIVILEGED_INST_CMPL",
     "BriefDescription": "PowerPC Instructions that completed while the thread is in Privileged state."
   },
   {
-    "EventCode": "3F150",
+    "EventCode": "0x3F150",
     "EventName": "PM_MRK_ST_DRAIN_CYC",
     "BriefDescription": "cycles to drain st from core to L2."
   },
   {
-    "EventCode": "3F054",
+    "EventCode": "0x3F054",
     "EventName": "PM_DISP_SS0_4_INSTR_CYC",
     "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions."
   },
   {
-    "EventCode": "3F056",
+    "EventCode": "0x3F056",
     "EventName": "PM_DISP_SS0_8_INSTR_CYC",
     "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions."
   },
   {
-    "EventCode": "30162",
+    "EventCode": "0x30162",
     "EventName": "PM_MRK_ISSUE_DEPENDENT_LOAD",
     "BriefDescription": "The marked instruction was dependent on a load. It is eligible for issue kill."
   },
   {
-    "EventCode": "40114",
+    "EventCode": "0x40114",
     "EventName": "PM_MRK_START_PROBE_NOP_DISP",
     "BriefDescription": "Marked Start probe nop dispatched. Instruction AND R0,R0,R0."
   },
   {
-    "EventCode": "4001C",
+    "EventCode": "0x4001C",
     "EventName": "PM_VSU_FIN",
     "BriefDescription": "VSU instructions finished."
   },
   {
-    "EventCode": "4C01A",
+    "EventCode": "0x4C01A",
     "EventName": "PM_EXEC_STALL_DMISS_OFF_NODE",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a distant chip."
   },
   {
-    "EventCode": "4D012",
+    "EventCode": "0x4D012",
     "EventName": "PM_PMC3_SAVED",
     "BriefDescription": "The conditions for the speculative event selected for PMC3 are met and PMC3 is charged."
   },
   {
-    "EventCode": "4D022",
+    "EventCode": "0x4D022",
     "EventName": "PM_HYPERVISOR_INST_CMPL",
     "BriefDescription": "PowerPC instructions that completed while the thread is in hypervisor state."
   },
   {
-    "EventCode": "4D026",
+    "EventCode": "0x4D026",
     "EventName": "PM_ULTRAVISOR_CYC",
     "BriefDescription": "Cycles when the thread is in Ultravisor state. MSR[S HV PR]=110."
   },
   {
-    "EventCode": "4D028",
+    "EventCode": "0x4D028",
     "EventName": "PM_PRIVILEGED_CYC",
     "BriefDescription": "Cycles when the thread is in Privileged state. MSR[S HV PR]=x00."
   },
   {
-    "EventCode": "40030",
+    "EventCode": "0x40030",
     "EventName": "PM_INST_FIN",
     "BriefDescription": "Instructions finished."
   },
   {
-    "EventCode": "44146",
+    "EventCode": "0x44146",
     "EventName": "PM_MRK_STCX_CORE_CYC",
     "BriefDescription": "Cycles spent in the core portion of a marked Stcx instruction. It starts counting when the instruction is decoded and stops counting when it drains into the L2."
   },
   {
-    "EventCode": "44054",
+    "EventCode": "0x44054",
     "EventName": "PM_VECTOR_LD_CMPL",
     "BriefDescription": "Vector load instructions completed."
   },
   {
-    "EventCode": "45054",
+    "EventCode": "0x45054",
     "EventName": "PM_FMA_CMPL",
     "BriefDescription": "Two floating point instructions completed (FMA class of instructions: fmadd, fnmadd, fmsub, fnmsub). Scalar instructions only."
   },
   {
-    "EventCode": "45056",
+    "EventCode": "0x45056",
     "EventName": "PM_SCALAR_FLOP_CMPL",
     "BriefDescription": "Scalar floating point instructions completed."
   },
   {
-    "EventCode": "4505C",
+    "EventCode": "0x4505C",
     "EventName": "PM_MATH_FLOP_CMPL",
     "BriefDescription": "Math floating point instructions completed."
   },
   {
-    "EventCode": "4D05E",
+    "EventCode": "0x4D05E",
     "EventName": "PM_BR_CMPL",
     "BriefDescription": "A branch completed. All branches are included."
   },
   {
-    "EventCode": "4E15E",
+    "EventCode": "0x4E15E",
     "EventName": "PM_MRK_INST_FLUSHED",
     "BriefDescription": "The marked instruction was flushed."
   },
   {
-    "EventCode": "401E6",
+    "EventCode": "0x401E6",
     "EventName": "PM_MRK_INST_FROM_L3MISS",
     "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked instruction."
   },
   {
-    "EventCode": "401E8",
+    "EventCode": "0x401E8",
     "EventName": "PM_MRK_DATA_FROM_L2MISS",
     "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss for a marked load."
   },
   {
-    "EventCode": "400F0",
+    "EventCode": "0x400F0",
     "EventName": "PM_LD_DEMAND_MISS_L1_FIN",
     "BriefDescription": "Load Missed L1, counted at finish time."
   },
   {
-    "EventCode": "400FA",
+    "EventCode": "0x400FA",
     "EventName": "PM_RUN_INST_CMPL",
     "BriefDescription": "Completed PowerPC instructions gated by the run latch."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
index b61b5cc157ee..b8aded6045fa 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
@@ -1,296 +1,291 @@
 [
   {
-    "EventCode": "100FE",
+    "EventCode": "0x100FE",
     "EventName": "PM_INST_CMPL",
     "BriefDescription": "PowerPC instructions completed."
   },
   {
-    "EventCode": "10006",
-    "EventName": "PM_DISP_STALL_HELD_OTHER_CYC",
-    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason."
-  },
-  {
-    "EventCode": "1000C",
+    "EventCode": "0x1000C",
     "EventName": "PM_LSU_LD0_FIN",
     "BriefDescription": "LSU Finished an internal operation in LD0 port."
   },
   {
-    "EventCode": "1000E",
+    "EventCode": "0x1000E",
     "EventName": "PM_MMA_ISSUED",
     "BriefDescription": "MMA instructions issued."
   },
   {
-    "EventCode": "10012",
+    "EventCode": "0x10012",
     "EventName": "PM_LSU_ST0_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST0 port."
   },
   {
-    "EventCode": "10014",
+    "EventCode": "0x10014",
     "EventName": "PM_LSU_ST4_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST4 port."
   },
   {
-    "EventCode": "10018",
+    "EventCode": "0x10018",
     "EventName": "PM_IC_DEMAND_CYC",
     "BriefDescription": "Cycles in which an instruction reload is pending to satisfy a demand miss."
   },
   {
-    "EventCode": "10022",
+    "EventCode": "0x10022",
     "EventName": "PM_PMC2_SAVED",
     "BriefDescription": "The conditions for the speculative event selected for PMC2 are met and PMC2 is charged."
   },
   {
-    "EventCode": "10024",
+    "EventCode": "0x10024",
     "EventName": "PM_PMC5_OVERFLOW",
     "BriefDescription": "The event selected for PMC5 caused the event counter to overflow."
   },
   {
-    "EventCode": "10058",
+    "EventCode": "0x10058",
     "EventName": "PM_EXEC_STALL_FIN_AT_DISP",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline finished at dispatch and did not require execution in the LSU, BRU or VSU."
   },
   {
-    "EventCode": "1005A",
+    "EventCode": "0x1005A",
     "EventName": "PM_FLUSH_MPRED",
     "BriefDescription": "A flush occurred due to a mispredicted branch. Includes target and direction."
   },
   {
-    "EventCode": "1C05A",
+    "EventCode": "0x1C05A",
     "EventName": "PM_DERAT_MISS_2M",
     "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
   },
   {
-    "EventCode": "10064",
-    "EventName": "PM_DISP_STALL_IC_L2",
-    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2."
+    "EventCode": "0x1E05A",
+    "EventName": "PM_CMPL_STALL_LWSYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete."
   },
   {
-    "EventCode": "10068",
+    "EventCode": "0x10068",
     "EventName": "PM_BR_FIN",
     "BriefDescription": "A branch instruction finished. Includes predicted/mispredicted/unconditional."
   },
   {
-    "EventCode": "1006A",
+    "EventCode": "0x1006A",
     "EventName": "PM_FX_LSU_FIN",
     "BriefDescription": "Simple fixed point instruction issued to the store unit. Measured at finish time."
   },
   {
-    "EventCode": "1006C",
+    "EventCode": "0x1006C",
     "EventName": "PM_RUN_CYC_ST_MODE",
     "BriefDescription": "Cycles when the run latch is set and the core is in ST mode."
   },
   {
-    "EventCode": "20004",
+    "EventCode": "0x20004",
     "EventName": "PM_ISSUE_STALL",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was dispatched but not issued yet."
   },
   {
-    "EventCode": "2000A",
+    "EventCode": "0x2000A",
     "EventName": "PM_HYPERVISOR_CYC",
     "BriefDescription": "Cycles when the thread is in Hypervisor state. MSR[S HV PR]=010."
   },
   {
-    "EventCode": "2000E",
+    "EventCode": "0x2000E",
     "EventName": "PM_LSU_LD1_FIN",
     "BriefDescription": "LSU Finished an internal operation in LD1 port."
   },
   {
-    "EventCode": "2C014",
+    "EventCode": "0x2C014",
     "EventName": "PM_CMPL_STALL_SPECIAL",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline required special handling before completing."
   },
   {
-    "EventCode": "2C018",
+    "EventCode": "0x2C018",
     "EventName": "PM_EXEC_STALL_DMISS_L3MISS",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a source beyond the local L2 or local L3."
   },
   {
-    "EventCode": "2D010",
+    "EventCode": "0x2D010",
     "EventName": "PM_LSU_ST1_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST1 port."
   },
   {
-    "EventCode": "2D012",
+    "EventCode": "0x2D012",
     "EventName": "PM_VSU1_ISSUE",
     "BriefDescription": "VSU instructions issued to VSU pipe 1."
   },
   {
-    "EventCode": "2D018",
+    "EventCode": "0x2D018",
     "EventName": "PM_EXEC_STALL_VSU",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the VSU (includes FXU, VSU, CRU)."
   },
   {
-    "EventCode": "2E01E",
+    "EventCode": "0x2D01C",
+    "EventName": "PM_CMPL_STALL_STCX",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing."
+  },
+  {
+    "EventCode": "0x2E01E",
     "EventName": "PM_EXEC_STALL_NTC_FLUSH",
-    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children."
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children. This event will also count cycles when the previous NTF instruction is still completing and the new NTF instruction is stalled at dispatch."
   },
   {
-    "EventCode": "2013C",
+    "EventCode": "0x2013C",
     "EventName": "PM_MRK_FX_LSU_FIN",
     "BriefDescription": "The marked instruction was simple fixed point that was issued to the store unit. Measured at finish time."
   },
   {
-    "EventCode": "2405A",
+    "EventCode": "0x2405A",
     "EventName": "PM_NTC_FIN",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status."
   },
   {
-    "EventCode": "201E2",
+    "EventCode": "0x201E2",
     "EventName": "PM_MRK_LD_MISS_L1",
     "BriefDescription": "Marked DL1 Demand Miss counted at finish time."
   },
   {
-    "EventCode": "200F4",
+    "EventCode": "0x200F4",
     "EventName": "PM_RUN_CYC",
     "BriefDescription": "Processor cycles gated by the run latch."
   },
   {
-    "EventCode": "30004",
-    "EventName": "PM_DISP_STALL_FLUSH",
-    "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC."
-  },
-  {
-    "EventCode": "30008",
+    "EventCode": "0x30008",
     "EventName": "PM_EXEC_STALL",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting to finish in one of the execution units (BRU, LSU, VSU). Only cycles between issue and finish are counted in this category."
   },
   {
-    "EventCode": "3001A",
+    "EventCode": "0x3001A",
     "EventName": "PM_LSU_ST2_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST2 port."
   },
   {
-    "EventCode": "30020",
+    "EventCode": "0x30020",
     "EventName": "PM_PMC2_REWIND",
     "BriefDescription": "The speculative event selected for PMC2 rewinds and the counter for PMC2 is not charged."
   },
   {
-    "EventCode": "30022",
+    "EventCode": "0x30022",
     "EventName": "PM_PMC4_SAVED",
     "BriefDescription": "The conditions for the speculative event selected for PMC4 are met and PMC4 is charged."
   },
   {
-    "EventCode": "30024",
+    "EventCode": "0x30024",
     "EventName": "PM_PMC6_OVERFLOW",
     "BriefDescription": "The event selected for PMC6 caused the event counter to overflow."
   },
   {
-    "EventCode": "30028",
+    "EventCode": "0x30028",
     "EventName": "PM_CMPL_STALL_MEM_ECC",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC."
   },
   {
-    "EventCode": "30036",
+    "EventCode": "0x30036",
     "EventName": "PM_EXEC_STALL_SIMPLE_FX",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a simple fixed point instruction executing in the Load Store Unit."
   },
   {
-    "EventCode": "3003A",
+    "EventCode": "0x3003A",
     "EventName": "PM_CMPL_STALL_EXCEPTION",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete."
   },
   {
-    "EventCode": "3F044",
+    "EventCode": "0x3F044",
     "EventName": "PM_VSU2_ISSUE",
     "BriefDescription": "VSU instructions issued to VSU pipe 2."
   },
   {
-    "EventCode": "30058",
+    "EventCode": "0x30058",
     "EventName": "PM_TLBIE_FIN",
     "BriefDescription": "TLBIE instructions finished in the LSU. Two TLBIEs can finish each cycle. All will be counted."
   },
   {
-    "EventCode": "3D058",
+    "EventCode": "0x3D058",
     "EventName": "PM_SCALAR_FSQRT_FDIV_ISSUE",
     "BriefDescription": "Scalar versions of four floating point operations: fdiv,fsqrt (xvdivdp, xvdivsp, xvsqrtdp, xvsqrtsp)."
   },
   {
-    "EventCode": "30066",
+    "EventCode": "0x30066",
     "EventName": "PM_LSU_FIN",
     "BriefDescription": "LSU Finished an internal operation (up to 4 per cycle)."
   },
   {
-    "EventCode": "40004",
+    "EventCode": "0x40004",
     "EventName": "PM_FXU_ISSUE",
     "BriefDescription": "A fixed point instruction was issued to the VSU."
   },
   {
-    "EventCode": "40008",
+    "EventCode": "0x40008",
     "EventName": "PM_NTC_ALL_FIN",
     "BriefDescription": "Cycles in which both instructions in the ICT entry pair show as finished. These are the cycles between finish and completion for the oldest pair of instructions in the pipeline."
   },
   {
-    "EventCode": "40010",
+    "EventCode": "0x40010",
     "EventName": "PM_PMC3_OVERFLOW",
     "BriefDescription": "The event selected for PMC3 caused the event counter to overflow."
   },
   {
-    "EventCode": "4C012",
+    "EventCode": "0x4C012",
     "EventName": "PM_EXEC_STALL_DERAT_ONLY_MISS",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered an ERAT miss and waited for it resolve."
   },
   {
-    "EventCode": "4C018",
+    "EventCode": "0x4C018",
     "EventName": "PM_CMPL_STALL",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline cannot complete because the thread was blocked for any reason."
   },
   {
-    "EventCode": "4C01E",
+    "EventCode": "0x4C01E",
     "EventName": "PM_LSU_ST3_FIN",
     "BriefDescription": "LSU Finished an internal operation in ST3 port."
   },
   {
-    "EventCode": "4D018",
+    "EventCode": "0x4D018",
     "EventName": "PM_EXEC_STALL_BRU",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Branch unit."
   },
   {
-    "EventCode": "4D01A",
+    "EventCode": "0x4D01A",
     "EventName": "PM_CMPL_STALL_HWSYNC",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a hwsync waiting for response from L2 before completing."
   },
   {
-    "EventCode": "4D01C",
+    "EventCode": "0x4D01C",
     "EventName": "PM_EXEC_STALL_TLBIEL",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIEL instruction executing in the Load Store Unit. TLBIEL instructions have lower overhead than TLBIE instructions because they don't get set to the nest."
   },
   {
-    "EventCode": "4E012",
+    "EventCode": "0x4E012",
     "EventName": "PM_EXEC_STALL_UNKNOWN",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline completed without an ntf_type pulse. The ntf_pulse was missed by the ISU because the NTF finishes and completions came too close together."
   },
   {
-    "EventCode": "4D020",
+    "EventCode": "0x4D020",
     "EventName": "PM_VSU3_ISSUE",
     "BriefDescription": "VSU instruction was issued to VSU pipe 3."
   },
   {
-    "EventCode": "40132",
+    "EventCode": "0x40132",
     "EventName": "PM_MRK_LSU_FIN",
     "BriefDescription": "LSU marked instruction finish."
   },
   {
-    "EventCode": "45058",
+    "EventCode": "0x45058",
     "EventName": "PM_IC_MISS_CMPL",
     "BriefDescription": "Non-speculative icache miss, counted at completion."
   },
   {
-    "EventCode": "4D050",
+    "EventCode": "0x4D050",
     "EventName": "PM_VSU_NON_FLOP_CMPL",
     "BriefDescription": "Non-floating point VSU instructions completed."
   },
   {
-    "EventCode": "4D052",
+    "EventCode": "0x4D052",
     "EventName": "PM_2FLOP_CMPL",
     "BriefDescription": "Double Precision vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg completed."
   },
   {
-    "EventCode": "400F2",
+    "EventCode": "0x400F2",
     "EventName": "PM_1PLUS_PPC_DISP",
     "BriefDescription": "Cycles at least one Instr Dispatched."
   },
   {
-    "EventCode": "400F8",
+    "EventCode": "0x400F8",
     "EventName": "PM_FLUSH",
     "BriefDescription": "Flush (any type)."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
index ea122a91ceb0..b5d1bd39cfb2 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
@@ -1,21 +1,21 @@
 [
   {
-    "EventCode": "301E8",
+    "EventCode": "0x301E8",
     "EventName": "PM_THRESH_EXC_64",
     "BriefDescription": "Threshold counter exceeded a value of 64."
   },
   {
-    "EventCode": "45050",
+    "EventCode": "0x45050",
     "EventName": "PM_1FLOP_CMPL",
     "BriefDescription": "One floating point instruction completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
   },
   {
-    "EventCode": "45052",
+    "EventCode": "0x45052",
     "EventName": "PM_4FLOP_CMPL",
     "BriefDescription": "Four floating point instructions completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
   },
   {
-    "EventCode": "4D054",
+    "EventCode": "0x4D054",
     "EventName": "PM_8FLOP_CMPL",
     "BriefDescription": "Four Double Precision vector instructions completed."
   }
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/translation.json b/tools/perf/pmu-events/arch/powerpc/power10/translation.json
index 5a714e3dd71a..db3766dca07c 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/translation.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/translation.json
@@ -1,56 +1,56 @@
 [
   {
-    "EventCode": "1F15E",
+    "EventCode": "0x1F15E",
     "EventName": "PM_MRK_START_PROBE_NOP_CMPL",
     "BriefDescription": "Marked Start probe nop (AND R0,R0,R0) completed."
   },
   {
-    "EventCode": "20016",
+    "EventCode": "0x20016",
     "EventName": "PM_ST_FIN",
     "BriefDescription": "Store finish count. Includes speculative activity."
   },
   {
-    "EventCode": "20018",
+    "EventCode": "0x20018",
     "EventName": "PM_ST_FWD",
     "BriefDescription": "Store forwards that finished."
   },
   {
-    "EventCode": "2011C",
+    "EventCode": "0x2011C",
     "EventName": "PM_MRK_NTF_CYC",
     "BriefDescription": "Cycles during which the marked instruction is the oldest in the pipeline (NTF or NTC)."
   },
   {
-    "EventCode": "2E01C",
+    "EventCode": "0x2E01C",
     "EventName": "PM_EXEC_STALL_TLBIE",
     "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIE instruction executing in the Load Store Unit."
   },
   {
-    "EventCode": "201E6",
+    "EventCode": "0x201E6",
     "EventName": "PM_THRESH_EXC_32",
     "BriefDescription": "Threshold counter exceeded a value of 32."
   },
   {
-    "EventCode": "200F0",
+    "EventCode": "0x200F0",
     "EventName": "PM_ST_CMPL",
     "BriefDescription": "Stores completed from S2Q (2nd-level store queue). This event includes regular stores, stcx and cache inhibited stores. The following operations are excluded (pteupdate, snoop tlbie complete, store atomics, miso, load atomic payloads, tlbie, tlbsync, slbieg, isync, msgsnd, slbiag, cpabort, copy, tcheck, tend, stsync, dcbst, icbi, dcbf, hwsync, lwsync, ptesync, eieio, msgsync)."
   },
   {
-    "EventCode": "200FE",
+    "EventCode": "0x200FE",
     "EventName": "PM_DATA_FROM_L2MISS",
     "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss."
   },
   {
-    "EventCode": "30010",
+    "EventCode": "0x30010",
     "EventName": "PM_PMC2_OVERFLOW",
     "BriefDescription": "The event selected for PMC2 caused the event counter to overflow."
   },
   {
-    "EventCode": "4D010",
+    "EventCode": "0x4D010",
     "EventName": "PM_PMC1_SAVED",
     "BriefDescription": "The conditions for the speculative event selected for PMC1 are met and PMC1 is charged."
   },
   {
-    "EventCode": "4D05C",
+    "EventCode": "0x4D05C",
     "EventName": "PM_DPP_FLOP_CMPL",
     "BriefDescription": "Double-Precision or Quad-Precision instructions completed."
   }
-- 
cgit v1.2.3


From 000ac42953395a4f0a63d5db640c5e4c88a548c5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 May 2021 15:10:58 -0400
Subject: selftests: kvm: fix overlapping addresses in memslot_perf_test

vm_create allocates memory and maps it close to GPA.  This memory
is separate from what is allocated in subsequent calls to
vm_userspace_mem_region_add, so it is incorrect to pass the
test memory size to vm_create_default.  Just pass a small
fixed amount of memory which can be used later for page table,
otherwise GPAs are already allocated at MEM_GPA and the
test aborts.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/memslot_perf_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 11239652d805..9307f25d8130 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
 	data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
 	TEST_ASSERT(data->hva_slots, "malloc() fail");
 
-	data->vm = vm_create_default(VCPU_ID, mempages, guest_code);
+	data->vm = vm_create_default(VCPU_ID, 1024, guest_code);
 
 	pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
 		max_mem_slots - 1, data->pages_per_slot, rempages);
-- 
cgit v1.2.3


From 6c1ced2f701618e912be6c549139d58c180419ea Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 1 Jun 2021 19:53:56 +0800
Subject: perf tools: Copy uapi/asm/perf_regs.h from the kernel for MIPS

To allow the build to complete on older systems, where those files are
either not uptodate, lacking some recent additions or not present at
all.

And check if the copy drifts from the kernel.

This commit is similar with
commit 12f020338a2c ("tools: Copy uapi/asm/perf_regs.h from the kernel")

With this commit, we can avoid the following build error in any case:

tools/perf/arch/mips/include/perf_regs.h:7:10:
fatal error: asm/perf_regs.h: No such file or directory
 #include <asm/perf_regs.h>
          ^~~~~~~~~~~~~~~~~
compilation terminated.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xuefeng Li <lixuefeng@loongson.cn>
Link: http://lore.kernel.org/lkml/1622548436-12472-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/mips/include/uapi/asm/perf_regs.h | 40 ++++++++++++++++++++++++++++
 tools/perf/Makefile.config                   |  1 -
 tools/perf/check-headers.sh                  |  1 +
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 tools/arch/mips/include/uapi/asm/perf_regs.h

(limited to 'tools')

diff --git a/tools/arch/mips/include/uapi/asm/perf_regs.h b/tools/arch/mips/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000000..d0f4ecd616cf
--- /dev/null
+++ b/tools/arch/mips/include/uapi/asm/perf_regs.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_MIPS_PERF_REGS_H
+#define _ASM_MIPS_PERF_REGS_H
+
+enum perf_event_mips_regs {
+	PERF_REG_MIPS_PC,
+	PERF_REG_MIPS_R1,
+	PERF_REG_MIPS_R2,
+	PERF_REG_MIPS_R3,
+	PERF_REG_MIPS_R4,
+	PERF_REG_MIPS_R5,
+	PERF_REG_MIPS_R6,
+	PERF_REG_MIPS_R7,
+	PERF_REG_MIPS_R8,
+	PERF_REG_MIPS_R9,
+	PERF_REG_MIPS_R10,
+	PERF_REG_MIPS_R11,
+	PERF_REG_MIPS_R12,
+	PERF_REG_MIPS_R13,
+	PERF_REG_MIPS_R14,
+	PERF_REG_MIPS_R15,
+	PERF_REG_MIPS_R16,
+	PERF_REG_MIPS_R17,
+	PERF_REG_MIPS_R18,
+	PERF_REG_MIPS_R19,
+	PERF_REG_MIPS_R20,
+	PERF_REG_MIPS_R21,
+	PERF_REG_MIPS_R22,
+	PERF_REG_MIPS_R23,
+	PERF_REG_MIPS_R24,
+	PERF_REG_MIPS_R25,
+	PERF_REG_MIPS_R26,
+	PERF_REG_MIPS_R27,
+	PERF_REG_MIPS_R28,
+	PERF_REG_MIPS_R29,
+	PERF_REG_MIPS_R30,
+	PERF_REG_MIPS_R31,
+	PERF_REG_MIPS_MAX = PERF_REG_MIPS_R31 + 1,
+};
+#endif /* _ASM_MIPS_PERF_REGS_H */
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 406a9519145e..73df23dd664c 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -90,7 +90,6 @@ endif
 ifeq ($(ARCH),mips)
   NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/mips/include/generated
-  CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi
   LIBUNWIND_LIBS = -lunwind -lunwind-mips
 endif
 
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index dd8ff287e930..c783558332b8 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -39,6 +39,7 @@ arch/x86/lib/x86-opcode-map.txt
 arch/x86/tools/gen-insn-attr-x86.awk
 arch/arm/include/uapi/asm/perf_regs.h
 arch/arm64/include/uapi/asm/perf_regs.h
+arch/mips/include/uapi/asm/perf_regs.h
 arch/powerpc/include/uapi/asm/perf_regs.h
 arch/s390/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/perf_regs.h
-- 
cgit v1.2.3


From 3cb17cce1e76ccc5499915a4d7e095a1ad6bf7ff Mon Sep 17 00:00:00 2001
From: Li Huafei <lihuafei1@huawei.com>
Date: Tue, 1 Jun 2021 17:27:50 +0800
Subject: perf probe: Fix NULL pointer dereference in
 convert_variable_location()

If we just check whether the variable can be converted, 'tvar' should be
a null pointer. However, the null pointer check is missing in the
'Constant value' execution path.

The following cases can trigger this problem:

	$ cat test.c
	#include <stdio.h>

	void main(void)
	{
	        int a;
	        const int b = 1;

	        asm volatile("mov %1, %0" : "=r"(a): "i"(b));
	        printf("a: %d\n", a);
	}

	$ gcc test.c -o test -O -g
	$ sudo ./perf probe -x ./test -L "main"
	<main@/home/lhf/test.c:0>
	      0  void main(void)
	         {
	      2          int a;
	                 const int b = 1;

	                 asm volatile("mov %1, %0" : "=r"(a): "i"(b));
	      6          printf("a: %d\n", a);
	         }

	$ sudo ./perf probe -x ./test -V "main:6"
	Segmentation fault

The check on 'tvar' is added. If 'tavr' is a null pointer, we return 0
to indicate that the variable can be converted. Now, we can successfully
show the variables that can be accessed.

	$ sudo ./perf probe -x ./test -V "main:6"
	Available variables at main:6
	        @<main+13>
	                char*   __fmt
	                int     a
	                int     b

However, the variable 'b' cannot be tracked.

	$ sudo ./perf probe -x ./test -D "main:6 b"
	Failed to find the location of the 'b' variable at this address.
	 Perhaps it has been optimized out.
	 Use -V with the --range option to show 'b' location range.
	  Error: Failed to add events.

This is because __die_find_variable_cb() did not successfully match
variable 'b', which has the DW_AT_const_value attribute instead of
DW_AT_location. We added support for DW_AT_const_value in
__die_find_variable_cb(). With this modification, we can successfully
track the variable 'b'.

	$ sudo ./perf probe -x ./test -D "main:6 b"
	p:probe_test/main_L6 /home/lhf/test:0x1156 b=\1:s32

Fixes: 66f69b219716 ("perf probe: Support DW_AT_const_value constant value")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Jianlin Lv <jianlin.lv@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Yang Jihong <yangjihong1@huawei.com>
Cc: Zhang Jinhao <zhangjinhao2@huawei.com>
http://lore.kernel.org/lkml/20210601092750.169601-1-lihuafei1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-aux.c    | 8 ++++++--
 tools/perf/util/probe-finder.c | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index b2f4920e19a6..7d2ba8419b0c 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -975,9 +975,13 @@ static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data)
 	if ((tag == DW_TAG_formal_parameter ||
 	     tag == DW_TAG_variable) &&
 	    die_compare_name(die_mem, fvp->name) &&
-	/* Does the DIE have location information or external instance? */
+	/*
+	 * Does the DIE have location information or const value
+	 * or external instance?
+	 */
 	    (dwarf_attr(die_mem, DW_AT_external, &attr) ||
-	     dwarf_attr(die_mem, DW_AT_location, &attr)))
+	     dwarf_attr(die_mem, DW_AT_location, &attr) ||
+	     dwarf_attr(die_mem, DW_AT_const_value, &attr)))
 		return DIE_FIND_CB_END;
 	if (dwarf_haspc(die_mem, fvp->addr))
 		return DIE_FIND_CB_CONTINUE;
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 866f2d514d72..b029c29ce227 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -190,6 +190,9 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
 	    immediate_value_is_supported()) {
 		Dwarf_Sword snum;
 
+		if (!tvar)
+			return 0;
+
 		dwarf_formsdata(&attr, &snum);
 		ret = asprintf(&tvar->value, "\\%ld", (long)snum);
 
-- 
cgit v1.2.3


From 4f2abe91922ba02bb419d91d92a518e4c805220b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 27 May 2021 11:28:35 -0700
Subject: perf record: Move probing cgroup sampling support

I found that checking cgroup sampling support using the missing features
doesn't work on old kernels.  Because it added both attr.cgroup bit and
PERF_SAMPLE_CGROUP bit, it needs to check whichever comes first (usually
the actual event, not dummy).

But it only checks the attr.cgroup bit which is set only in the dummy
event so cannot detect failtures due the sample bits.  Also we don't
ignore the missing feature and retry, it'd be better checking it with
the API probing logic.

Committer notes:

Extracted the minimal part to check using the new cgroup API probe
routine, the part that removes the cgroup member can be left for further
discussion.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20210527182835.1634339-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c      |  6 ++++++
 tools/perf/util/perf_api_probe.c | 10 ++++++++++
 tools/perf/util/perf_api_probe.h |  1 +
 3 files changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3337b5f93336..84803abeb942 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2714,6 +2714,12 @@ int cmd_record(int argc, const char **argv)
 		rec->no_buildid = true;
 	}
 
+	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
+		pr_err("Kernel has no cgroup sampling support.\n");
+		err = -EINVAL;
+		goto out_opts;
+	}
+
 	if (rec->opts.kcore)
 		rec->data.is_dir = true;
 
diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c
index 829af17a0867..020411682a3c 100644
--- a/tools/perf/util/perf_api_probe.c
+++ b/tools/perf/util/perf_api_probe.c
@@ -103,6 +103,11 @@ static void perf_probe_build_id(struct evsel *evsel)
 	evsel->core.attr.build_id = 1;
 }
 
+static void perf_probe_cgroup(struct evsel *evsel)
+{
+	evsel->core.attr.cgroup = 1;
+}
+
 bool perf_can_sample_identifier(void)
 {
 	return perf_probe_api(perf_probe_sample_identifier);
@@ -182,3 +187,8 @@ bool perf_can_record_build_id(void)
 {
 	return perf_probe_api(perf_probe_build_id);
 }
+
+bool perf_can_record_cgroup(void)
+{
+	return perf_probe_api(perf_probe_cgroup);
+}
diff --git a/tools/perf/util/perf_api_probe.h b/tools/perf/util/perf_api_probe.h
index f12ca55f509a..b104168efb15 100644
--- a/tools/perf/util/perf_api_probe.h
+++ b/tools/perf/util/perf_api_probe.h
@@ -12,5 +12,6 @@ bool perf_can_record_switch_events(void);
 bool perf_can_record_text_poke_events(void);
 bool perf_can_sample_identifier(void);
 bool perf_can_record_build_id(void);
+bool perf_can_record_cgroup(void);
 
 #endif // __PERF_API_PROBE_H
-- 
cgit v1.2.3


From d3fddc355a4a4415e8d43d1faae1be713d65cf5e Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Mon, 17 May 2021 16:12:54 +0800
Subject: perf stat: Fix error return code in bperf__load()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix to return a negative error code from the error handling case instead
of 0, as done elsewhere in this function.

Committer notes:

Added the missing {} for the now multiline 'if' block, fixing this error:

    CC      /tmp/build/perf/util/bpf_counter.o
  util/bpf_counter.c: In function ‘bperf__load’:
  util/bpf_counter.c:523:9: error: this ‘if’ clause does not guard... [-Werror=misleading-indentation]
    523 |         if (evsel->bperf_leader_link_fd < 0 &&
        |         ^~
  util/bpf_counter.c:526:17: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’
    526 |                 goto out;
        |                 ^~~~
  cc1: all warnings being treated as errors

Fixes: 7fac83aaf2eecc9e ("perf stat: Introduce 'bperf' to share hardware PMCs with BPF")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Zhang Yi <yi.zhang@huawei.com>
Link: http://lore.kernel.org/lkml/20210517081254.1561564-1-yukuai3@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_counter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 974f10e356f0..5ed674a2f55e 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -521,9 +521,10 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 
 	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
 	if (evsel->bperf_leader_link_fd < 0 &&
-	    bperf_reload_leader_program(evsel, attr_map_fd, &entry))
+	    bperf_reload_leader_program(evsel, attr_map_fd, &entry)) {
+		err = -1;
 		goto out;
-
+	}
 	/*
 	 * The bpf_link holds reference to the leader program, and the
 	 * leader program holds reference to the maps. Therefore, if
@@ -550,6 +551,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 	/* Step 2: load the follower skeleton */
 	evsel->follower_skel = bperf_follower_bpf__open();
 	if (!evsel->follower_skel) {
+		err = -1;
 		pr_err("Failed to open follower skeleton\n");
 		goto out;
 	}
-- 
cgit v1.2.3


From f677ec94f6fb9d895f40403bd54236f7763c29db Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Fri, 28 May 2021 11:10:50 +0200
Subject: perf test: Test 17 fails with make LIBPFM4=1 on s390 z/VM

This test case fails on s390 virtual machine z/VM which has no PMU support
when the perf tool is built with LIBPFM4=1.

Using make LIBPFM4=1 builds the perf tool with support for libpfm
event notation. The command line flag --pfm-events is valid:
 # ./perf record --pfm-events cycles -- true
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.001 MB perf.data (2 samples) ]
 #

However the command 'perf test -Fv 17' fails on s390 z/VM virtual machine
with LIBPFM4=1:
  # perf test -Fv 17
  17: Setup struct perf_event_attr                                    :
  --- start ---
  .....
  running './tests/attr/test-record-group2'
  unsupp  './tests/attr/test-record-group2'
  running './tests/attr/test-record-pfm-period'
  expected exclude_hv=0, got 1
 FAILED './tests/attr/test-record-pfm-period' - match failure
 ---- end ----
 Setup struct perf_event_attr: FAILED!

When --pfm-event system is not supported, the test returns unsupported
and continues. Here is an example using a virtual machine on x86 and
Fedora 34:
 [root@f33 perf]# perf test -Fv 17
 17: Setup struct perf_event_attr                                    :
 --- start ---
 .....
 running './tests/attr/test-record-group2'
 unsupp  './tests/attr/test-record-group2'
 running './tests/attr/test-record-pfm-period'
 unsupp  './tests/attr/test-record-pfm-period'
 ....

The issue is file ./tests/attr/test-record-pfm-period
which requires perf event attribute member exclude_hv to be zero.
This is not the case on s390 where the value of exclude_hv is one when
executing on a z/VM virtual machine without PMU hardware support.

Fix this by allowing value exlucde_hv to be zero or one.

Output before:
 # /usr/bin/python ./tests/attr.py -d ./tests/attr/ -t \
	test-record-pfm-period -p ./perf  -vvv 2>&1| fgrep match
    matching [event:base-record]
    match: [event:base-record] matches []
 FAILED './tests/attr//test-record-pfm-period' - match failure
 #

Output after:
 # /usr/bin/python ./tests/attr.py -d ./tests/attr/ -t \
	test-record-pfm-period -p ./perf  -vvv 2>&1| fgrep match
    matching [event:base-record]
    match: [event:base-record] matches ['event-1-0-6', 'event-1-0-5']
  matched

Background:
Using libpfm library ends up in this function call sequence

pfm_get_perf_event_encoding()
+-- pfm_get_os_event_encoding()
    +-- pfmlib_perf_event_encode()

is called when no hardware specific PMU unit can be detected
as in the s390 z/VM virtual machine case. This uses the
"perf_events generic PMU" data structure which sets exclude_hv
to 1 per default.  Using this PMU that test case always fails.

That is the reason why exclude_hv attribute setting varies.

Version 2:

   As suggested by Ian Rogers make perf_event_attribute member
   exclude_hv more robust and accept value 0 or 1 to handle more
   test cases which might fail on s390 virtual machine z/VM.

Suggested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: http://lore.kernel.org/lkml/20210528091050.245838-1-tmricht@linux.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/attr/base-record | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record
index 4a7b8deef3fd..8c10955eff93 100644
--- a/tools/perf/tests/attr/base-record
+++ b/tools/perf/tests/attr/base-record
@@ -16,7 +16,7 @@ pinned=0
 exclusive=0
 exclude_user=0
 exclude_kernel=0|1
-exclude_hv=0
+exclude_hv=0|1
 exclude_idle=0
 mmap=1
 comm=1
-- 
cgit v1.2.3


From 2dc065eae56df804e4da5f8a9e4139033f7ea605 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 2 Jun 2021 14:22:40 -0700
Subject: perf evsel: Add missing cloning of evsel->use_config_name

The evsel__clone() should copy all fields in the evsel which are set
during the event parsing.  But it missed the use_config_name field.

Fixes: 12279429d862 ("perf stat: Uniquify hybrid event name")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20210602212241.2175005-2-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 1 +
 tools/perf/util/evsel.h | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4a3cd1b5bb33..a8d8463f8ee5 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -428,6 +428,7 @@ struct evsel *evsel__clone(struct evsel *orig)
 	evsel->auto_merge_stats = orig->auto_merge_stats;
 	evsel->collect_stat = orig->collect_stat;
 	evsel->weak_group = orig->weak_group;
+	evsel->use_config_name = orig->use_config_name;
 
 	if (evsel__copy_config_terms(evsel, orig) < 0)
 		goto out_err;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 75cf5dbfe208..bdad52a06438 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -83,8 +83,10 @@ struct evsel {
 		bool			collect_stat;
 		bool			weak_group;
 		bool			bpf_counter;
+		bool			use_config_name;
 		int			bpf_fd;
 		struct bpf_object	*bpf_obj;
+		struct list_head	config_terms;
 	};
 
 	/*
@@ -116,10 +118,8 @@ struct evsel {
 	bool			merged_stat;
 	bool			reset_group;
 	bool			errored;
-	bool			use_config_name;
 	struct hashmap		*per_pkg_mask;
 	struct evsel		*leader;
-	struct list_head	config_terms;
 	int			err;
 	int			cpu_iter;
 	struct {
-- 
cgit v1.2.3


From 3cc84399e9b60463bc39cf352ffd8bccb92e02bd Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 2 Jun 2021 14:22:41 -0700
Subject: perf stat: Honor event config name on --no-merge

If user gave an event name explicitly, it should be displayed in the
output as is.  But with --no-merge option it adds a pmu name at the
end so might confuse users.

Actually this is true for hybrid pmus, I think we should do the same
for others.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20210602212241.2175005-3-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index a76fff5e7d83..ca326f98c7a2 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -541,7 +541,7 @@ static void uniquify_event_name(struct evsel *counter)
 	char *config;
 	int ret = 0;
 
-	if (counter->uniquified_name ||
+	if (counter->uniquified_name || counter->use_config_name ||
 	    !counter->pmu_name || !strncmp(counter->name, counter->pmu_name,
 					   strlen(counter->pmu_name)))
 		return;
@@ -555,10 +555,8 @@ static void uniquify_event_name(struct evsel *counter)
 		}
 	} else {
 		if (perf_pmu__has_hybrid()) {
-			if (!counter->use_config_name) {
-				ret = asprintf(&new_name, "%s/%s/",
-					       counter->pmu_name, counter->name);
-			}
+			ret = asprintf(&new_name, "%s/%s/",
+				       counter->pmu_name, counter->name);
 		} else {
 			ret = asprintf(&new_name, "%s [%s]",
 				       counter->name, counter->pmu_name);
-- 
cgit v1.2.3


From 69c9ffed6cede9c11697861f654946e3ae95a930 Mon Sep 17 00:00:00 2001
From: Riccardo Mancini <rickyman7@gmail.com>
Date: Thu, 3 Jun 2021 00:08:33 +0200
Subject: perf symbol-elf: Fix memory leak by freeing sdt_note.args

Reported by ASan.

Signed-off-by: Riccardo Mancini <rickyman7@gmail.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fabian Hemmer <copy@copy.sh>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Remi Bernon <rbernon@codeweavers.com>
Cc: Jiri Slaby <jirislaby@kernel.org>
Link: http://lore.kernel.org/lkml/20210602220833.285226-1-rickyman7@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol-elf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 4c56aa837434..a73345730ba9 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -2412,6 +2412,7 @@ int cleanup_sdt_note_list(struct list_head *sdt_notes)
 
 	list_for_each_entry_safe(pos, tmp, sdt_notes, note_list) {
 		list_del_init(&pos->note_list);
+		zfree(&pos->args);
 		zfree(&pos->name);
 		zfree(&pos->provider);
 		free(pos);
-- 
cgit v1.2.3


From 67069a1f0fe5f9eeca86d954fff2087f5542a008 Mon Sep 17 00:00:00 2001
From: Riccardo Mancini <rickyman7@gmail.com>
Date: Thu, 3 Jun 2021 00:40:23 +0200
Subject: perf env: Fix memory leak of bpf_prog_info_linear member

ASan reported a memory leak caused by info_linear not being deallocated.

The info_linear was allocated during in perf_event__synthesize_one_bpf_prog().

This patch adds the corresponding free() when bpf_prog_info_node
is freed in perf_env__purge_bpf().

  $ sudo ./perf record -- sleep 5
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.025 MB perf.data (8 samples) ]

  =================================================================
  ==297735==ERROR: LeakSanitizer: detected memory leaks

  Direct leak of 7688 byte(s) in 19 object(s) allocated from:
      #0 0x4f420f in malloc (/home/user/linux/tools/perf/perf+0x4f420f)
      #1 0xc06a74 in bpf_program__get_prog_info_linear /home/user/linux/tools/lib/bpf/libbpf.c:11113:16
      #2 0xb426fe in perf_event__synthesize_one_bpf_prog /home/user/linux/tools/perf/util/bpf-event.c:191:16
      #3 0xb42008 in perf_event__synthesize_bpf_events /home/user/linux/tools/perf/util/bpf-event.c:410:9
      #4 0x594596 in record__synthesize /home/user/linux/tools/perf/builtin-record.c:1490:8
      #5 0x58c9ac in __cmd_record /home/user/linux/tools/perf/builtin-record.c:1798:8
      #6 0x58990b in cmd_record /home/user/linux/tools/perf/builtin-record.c:2901:8
      #7 0x7b2a20 in run_builtin /home/user/linux/tools/perf/perf.c:313:11
      #8 0x7b12ff in handle_internal_command /home/user/linux/tools/perf/perf.c:365:8
      #9 0x7b2583 in run_argv /home/user/linux/tools/perf/perf.c:409:2
      #10 0x7b0d79 in main /home/user/linux/tools/perf/perf.c:539:3
      #11 0x7fa357ef6b74 in __libc_start_main /usr/src/debug/glibc-2.33-8.fc34.x86_64/csu/../csu/libc-start.c:332:16

Signed-off-by: Riccardo Mancini <rickyman7@gmail.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Link: http://lore.kernel.org/lkml/20210602224024.300485-1-rickyman7@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/env.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 9130f6fad8d5..bc5e4f294e9e 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -144,6 +144,7 @@ static void perf_env__purge_bpf(struct perf_env *env)
 		node = rb_entry(next, struct bpf_prog_info_node, rb_node);
 		next = rb_next(&node->rb_node);
 		rb_erase(&node->rb_node, root);
+		free(node->info_linear);
 		free(node);
 	}
 
-- 
cgit v1.2.3


From 263e88d678baa1a2e3f2d5afbdcd9fd3feb80a4d Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 4 Jun 2021 20:01:30 -0700
Subject: proc: add .gitignore for proc-subset-pid selftest

This new selftest needs an entry in the .gitignore file otherwise git
will try to track the binary.

Link: https://lkml.kernel.org/r/20210601164305.11776-1-dmatlack@google.com
Fixes: 268af17ada5855 ("selftests: proc: test subset=pid")
Signed-off-by: David Matlack <dmatlack@google.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexey Gladkov <gladkov.alexey@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/proc/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index bed4b5318a86..8f3e72e626fa 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -10,6 +10,7 @@
 /proc-self-map-files-002
 /proc-self-syscall
 /proc-self-wchan
+/proc-subset-pid
 /proc-uptime-001
 /proc-uptime-002
 /read
-- 
cgit v1.2.3


From 11fc79fc9f2e395aa39fa5baccae62767c5d8280 Mon Sep 17 00:00:00 2001
From: Kev Jackson <foamdino@gmail.com>
Date: Mon, 7 Jun 2021 14:08:35 +0100
Subject: libbpf: Fixes incorrect rx_ring_setup_done

When calling xsk_socket__create_shared(), the logic at line 1097 marks a
boolean flag true within the xsk_umem structure to track setup progress
in order to support multiple calls to the function.  However, instead of
marking umem->tx_ring_setup_done, the code incorrectly sets
umem->rx_ring_setup_done.  This leads to improper behaviour when
creating and destroying xsk and umem structures.

Multiple calls to this function is documented as supported.

Fixes: ca7a83e2487a ("libbpf: Only create rx and tx XDP rings when necessary")
Signed-off-by: Kev Jackson <foamdino@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/YL4aU4f3Aaik7CN0@linux-dev
---
 tools/lib/bpf/xsk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 6061431ee04c..e9b619aa0cdf 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -1094,7 +1094,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 			goto out_put_ctx;
 		}
 		if (xsk->fd == umem->fd)
-			umem->rx_ring_setup_done = true;
+			umem->tx_ring_setup_done = true;
 	}
 
 	err = xsk_get_mmap_offsets(xsk->fd, &off);
-- 
cgit v1.2.3


From 1bc603af73dd8fb2934306e861009c54f973dcc2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 8 Jun 2021 14:39:54 +0200
Subject: KVM: selftests: introduce P47V64 for s390x

s390x can have up to 47bits of physical guest and 64bits of virtual
address  bits. Add a new address mode to avoid errors of testcases
going beyond 47bits.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Message-Id: <20210608123954.10991-1-borntraeger@de.ibm.com>
Fixes: ef4c9f4f6546 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()")
Cc: stable@vger.kernel.org
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 3 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c     | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index fcd8e3855111..b602552b1ed0 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -43,6 +43,7 @@ enum vm_guest_mode {
 	VM_MODE_P40V48_4K,
 	VM_MODE_P40V48_64K,
 	VM_MODE_PXXV48_4K,	/* For 48bits VA but ANY bits PA */
+	VM_MODE_P47V64_4K,
 	NUM_VM_MODES,
 };
 
@@ -60,7 +61,7 @@ enum vm_guest_mode {
 
 #elif defined(__s390x__)
 
-#define VM_MODE_DEFAULT			VM_MODE_P52V48_4K
+#define VM_MODE_DEFAULT			VM_MODE_P47V64_4K
 #define MIN_PAGE_SHIFT			12U
 #define ptes_per_page(page_size)	((page_size) / 16)
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 28e528c19d28..b126fab6c4e1 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i)
 		[VM_MODE_P40V48_4K]	= "PA-bits:40,  VA-bits:48,  4K pages",
 		[VM_MODE_P40V48_64K]	= "PA-bits:40,  VA-bits:48, 64K pages",
 		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
+		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
 	};
 	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
 		       "Missing new mode strings?");
@@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
 	{ 40, 48,  0x1000, 12 },
 	{ 40, 48, 0x10000, 16 },
 	{  0,  0,  0x1000, 12 },
+	{ 47, 64,  0x1000, 12 },
 };
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
 	       "Missing new mode params?");
@@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 		TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
 #endif
 		break;
+	case VM_MODE_P47V64_4K:
+		vm->pgtable_levels = 5;
+		break;
 	default:
 		TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
 	}
-- 
cgit v1.2.3


From f53b16ad64408b5376836708f8cf42dbf1cf6098 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Wed, 9 Jun 2021 07:38:16 +0800
Subject: selftests: kvm: Add support for customized slot0 memory size

Until commit 39fe2fc96694 ("selftests: kvm: make allocation of extra
memory take effect", 2021-05-27), parameter extra_mem_pages was used
only to calculate the page table size for all the memory chunks,
because real memory allocation happened with calls of
vm_userspace_mem_region_add() after vm_create_default().

Commit 39fe2fc96694 however changed the meaning of extra_mem_pages to
the size of memory slot 0.  This makes the memory allocation more
flexible, but makes it harder to account for the number of
pages needed for the page tables.  For example, memslot_perf_test
has a small amount of memory in slot 0 but a lot in other slots,
and adding that memory twice (both in slot 0 and with later
calls to vm_userspace_mem_region_add()) causes an error that
was fixed in commit 000ac4295339 ("selftests: kvm: fix overlapping
addresses in memslot_perf_test", 2021-05-29)

Since both uses are sensible, add a new parameter slot0_mem_pages
to vm_create_with_vcpus() and some comments to clarify the meaning of
slot0_mem_pages and extra_mem_pages.  With this change,
memslot_perf_test can go back to passing the number of memory
pages as extra_mem_pages.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20210608233816.423958-4-zhenzhong.duan@intel.com>
[Squashed in a single patch and rewrote the commit message. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h    |  7 ++--
 tools/testing/selftests/kvm/kvm_page_table_test.c |  2 +-
 tools/testing/selftests/kvm/lib/kvm_util.c        | 47 ++++++++++++++++++-----
 tools/testing/selftests/kvm/lib/perf_test_util.c  |  2 +-
 tools/testing/selftests/kvm/memslot_perf_test.c   |  2 +-
 5 files changed, 45 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index b602552b1ed0..35739567189e 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -286,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
 					    uint32_t num_percpu_pages, void *guest_code,
 					    uint32_t vcpuids[]);
 
-/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */
+/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */
 struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
-				    uint64_t extra_mem_pages, uint32_t num_percpu_pages,
-				    void *guest_code, uint32_t vcpuids[]);
+				    uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
+				    uint32_t num_percpu_pages, void *guest_code,
+				    uint32_t vcpuids[]);
 
 /*
  * Adds a vCPU with reasonable defaults (e.g. a stack)
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 1c4753fff19e..82171f17c1d7 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
 
 	/* Create a VM with enough guest pages */
 	guest_num_pages = test_mem_size / guest_page_size;
-	vm = vm_create_with_vcpus(mode, nr_vcpus,
+	vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
 				  guest_num_pages, 0, guest_code, NULL);
 
 	/* Align down GPA of the testing memslot */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index b126fab6c4e1..5c70596dd1b9 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -313,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 	return vm;
 }
 
+/*
+ * VM Create with customized parameters
+ *
+ * Input Args:
+ *   mode - VM Mode (e.g. VM_MODE_P52V48_4K)
+ *   nr_vcpus - VCPU count
+ *   slot0_mem_pages - Slot0 physical memory size
+ *   extra_mem_pages - Non-slot0 physical memory total size
+ *   num_percpu_pages - Per-cpu physical memory pages
+ *   guest_code - Guest entry point
+ *   vcpuids - VCPU IDs
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to opaque structure that describes the created VM.
+ *
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K),
+ * with customized slot0 memory size, at least 512 pages currently.
+ * extra_mem_pages is only used to calculate the maximum page table size,
+ * no real memory allocation for non-slot0 memory in this function.
+ */
 struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
-				    uint64_t extra_mem_pages, uint32_t num_percpu_pages,
-				    void *guest_code, uint32_t vcpuids[])
+				    uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
+				    uint32_t num_percpu_pages, void *guest_code,
+				    uint32_t vcpuids[])
 {
+	uint64_t vcpu_pages, extra_pg_pages, pages;
+	struct kvm_vm *vm;
+	int i;
+
+	/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
+	if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
+		slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
+
 	/* The maximum page table size for a memory region will be when the
 	 * smallest pages are used. Considering each page contains x page
 	 * table descriptors, the total extra size for page tables (for extra
 	 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
 	 * than N/x*2.
 	 */
-	uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
-	uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
-	uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages;
-	struct kvm_vm *vm;
-	int i;
+	vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
+	extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
+	pages = slot0_mem_pages + vcpu_pages + extra_pg_pages;
 
 	TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
 		    "nr_vcpus = %d too large for host, max-vcpus = %d",
@@ -359,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
 					    uint32_t num_percpu_pages, void *guest_code,
 					    uint32_t vcpuids[])
 {
-	return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages,
-				    num_percpu_pages, guest_code, vcpuids);
+	return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
+				    extra_mem_pages, num_percpu_pages, guest_code, vcpuids);
 }
 
 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index abf381800a59..7397ca299835 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
 	TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
 		    "Guest memory size is not guest page size aligned.");
 
-	vm = vm_create_with_vcpus(mode, vcpus,
+	vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
 				  (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size,
 				  0, guest_code, NULL);
 
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 9307f25d8130..11239652d805 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
 	data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
 	TEST_ASSERT(data->hva_slots, "malloc() fail");
 
-	data->vm = vm_create_default(VCPU_ID, 1024, guest_code);
+	data->vm = vm_create_default(VCPU_ID, mempages, guest_code);
 
 	pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
 		max_mem_slots - 1, data->pages_per_slot, rempages);
-- 
cgit v1.2.3


From e8ba0b2b64126381643bb50df3556b139a60545a Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 8 May 2021 11:42:16 +0800
Subject: tools/bootconfig: Fix error return code in apply_xbc()

Fix to return a negative error code from the error handling case instead
of 0, as done elsewhere in this function.

Link: https://lkml.kernel.org/r/20210508034216.2277-1-thunder.leizhen@huawei.com

Fixes: a995e6bc0524 ("tools/bootconfig: Fix to check the write failure correctly")
Reported-by: Hulk Robot <hulkci@huawei.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/bootconfig/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 7362bef1a368..6cd6080cac04 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -399,6 +399,7 @@ static int apply_xbc(const char *path, const char *xbc_path)
 	}
 	/* TODO: Ensure the @path is initramfs/initrd image */
 	if (fstat(fd, &stat) < 0) {
+		ret = -errno;
 		pr_err("Failed to get the size of %s\n", path);
 		goto out;
 	}
-- 
cgit v1.2.3


From 824afd55e95c3cb12c55d297a0ae408be1779cc8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 13 May 2021 12:06:33 +0900
Subject: tools/bootconfig: Fix a build error accroding to undefined
 fallthrough

Since the "fallthrough" is defined only in the kernel, building
lib/bootconfig.c as a part of user-space tools causes a build
error.

Add a dummy fallthrough to avoid the build error.

Link: https://lkml.kernel.org/r/162087519356.442660.11385099982318160180.stgit@devnote2

Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Fixes: 4c1ca831adb1 ("Revert "lib: Revert use of fallthrough pseudo-keyword in lib/"")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/bootconfig/include/linux/bootconfig.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
index 078cbd2ba651..de7f30f99af3 100644
--- a/tools/bootconfig/include/linux/bootconfig.h
+++ b/tools/bootconfig/include/linux/bootconfig.h
@@ -4,4 +4,8 @@
 
 #include "../../../../include/linux/bootconfig.h"
 
+#ifndef fallthrough
+# define fallthrough
+#endif
+
 #endif
-- 
cgit v1.2.3


From 82944421243e5988898f54266687586ba07d889e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 Jun 2021 13:48:17 +0200
Subject: selftests: netfilter: add fib test case

There is a bug report on netfilter.org bugzilla pointing to fib
expression dropping ipv6 DAD packets.

Add a test case that demonstrates this problem.

Next patch excludes icmpv6 packets coming from any to linklocal.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/Makefile   |   2 +-
 tools/testing/selftests/netfilter/nft_fib.sh | 221 +++++++++++++++++++++++++++
 2 files changed, 222 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/netfilter/nft_fib.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
index 3171069a6b46..cd6430b39982 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for netfilter selftests
 
-TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \
+TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
 	conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
 	nft_concat_range.sh nft_conntrack_helper.sh \
 	nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh
new file mode 100755
index 000000000000..6caf6ac8c285
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_fib.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+#
+# This tests the fib expression.
+#
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+nsrouter="nsrouter-$sfx"
+timeout=4
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+cleanup()
+{
+	ip netns del ${ns1}
+	ip netns del ${ns2}
+	ip netns del ${nsrouter}
+
+	[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
+}
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+ip netns add ${nsrouter}
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not create net namespace"
+	exit $ksft_skip
+fi
+
+trap cleanup EXIT
+
+dmesg | grep -q ' nft_rpfilter: '
+if [ $? -eq 0 ]; then
+	dmesg -c | grep ' nft_rpfilter: '
+	echo "WARN: a previous test run has failed" 1>&2
+fi
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+ip netns add ${ns1}
+ip netns add ${ns2}
+
+load_ruleset() {
+	local netns=$1
+
+ip netns exec ${netns} nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+	        fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop
+	}
+}
+EOF
+}
+
+load_ruleset_count() {
+	local netns=$1
+
+ip netns exec ${netns} nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+		ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop
+		ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop
+	}
+}
+EOF
+}
+
+check_drops() {
+	dmesg | grep -q ' nft_rpfilter: '
+	if [ $? -eq 0 ]; then
+		dmesg | grep ' nft_rpfilter: '
+		echo "FAIL: rpfilter did drop packets"
+		return 1
+	fi
+
+	return 0
+}
+
+check_fib_counter() {
+	local want=$1
+	local ns=$2
+	local address=$3
+
+	line=$(ip netns exec ${ns} nft list table inet filter | grep 'fib saddr . iif' | grep $address | grep "packets $want" )
+	ret=$?
+
+	if [ $ret -ne 0 ];then
+		echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2
+		ip netns exec ${ns} nft list table inet filter
+		return 1
+	fi
+
+	if [ $want -gt 0 ]; then
+		echo "PASS: fib expression did drop packets for $address"
+	fi
+
+	return 0
+}
+
+load_ruleset ${nsrouter}
+load_ruleset ${ns1}
+load_ruleset ${ns2}
+
+ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1
+if [ $? -ne 0 ];then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2}
+
+ip -net ${nsrouter} link set lo up
+ip -net ${nsrouter} link set veth0 up
+ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0
+ip -net ${nsrouter} addr add dead:1::1/64 dev veth0
+
+ip -net ${nsrouter} link set veth1 up
+ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1
+ip -net ${nsrouter} addr add dead:2::1/64 dev veth1
+
+ip -net ${ns1} link set lo up
+ip -net ${ns1} link set eth0 up
+
+ip -net ${ns2} link set lo up
+ip -net ${ns2} link set eth0 up
+
+ip -net ${ns1} addr add 10.0.1.99/24 dev eth0
+ip -net ${ns1} addr add dead:1::99/64 dev eth0
+ip -net ${ns1} route add default via 10.0.1.1
+ip -net ${ns1} route add default via dead:1::1
+
+ip -net ${ns2} addr add 10.0.2.99/24 dev eth0
+ip -net ${ns2} addr add dead:2::99/64 dev eth0
+ip -net ${ns2} route add default via 10.0.2.1
+ip -net ${ns2} route add default via dead:2::1
+
+test_ping() {
+  local daddr4=$1
+  local daddr6=$2
+
+  ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null
+  ret=$?
+  if [ $ret -ne 0 ];then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2
+	return 1
+  fi
+
+  ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null
+  ret=$?
+  if [ $ret -ne 0 ];then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2
+	return 1
+  fi
+
+  return 0
+}
+
+ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+sleep 3
+
+test_ping 10.0.2.1 dead:2::1 || exit 1
+check_drops || exit 1
+
+test_ping 10.0.2.99 dead:2::99 || exit 1
+check_drops || exit 1
+
+echo "PASS: fib expression did not cause unwanted packet drops"
+
+ip netns exec ${nsrouter} nft flush table inet filter
+
+ip -net ${ns1} route del default
+ip -net ${ns1} -6 route del default
+
+ip -net ${ns1} addr del 10.0.1.99/24 dev eth0
+ip -net ${ns1} addr del dead:1::99/64 dev eth0
+
+ip -net ${ns1} addr add 10.0.2.99/24 dev eth0
+ip -net ${ns1} addr add dead:2::99/64 dev eth0
+
+ip -net ${ns1} route add default via 10.0.2.1
+ip -net ${ns1} -6 route add default via dead:2::1
+
+ip -net ${nsrouter} addr add dead:2::1/64 dev veth0
+
+# switch to ruleset that doesn't log, this time
+# its expected that this does drop the packets.
+load_ruleset_count ${nsrouter}
+
+# ns1 has a default route, but nsrouter does not.
+# must not check return value, ping to 1.1.1.1 will
+# fail.
+check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1
+check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1
+
+ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null
+check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1
+
+sleep 2
+ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null
+check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1
+
+exit 0
-- 
cgit v1.2.3


From 584fd3b31889852d0d6f3dd1e3d8e9619b660d2c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 7 Jun 2021 11:45:58 +0200
Subject: objtool: Fix .symtab_shndx handling for elf_create_undef_symbol()

When an ELF object uses extended symbol section indexes (IOW it has a
.symtab_shndx section), these must be kept in sync with the regular
symbol table (.symtab).

So for every new symbol we emit, make sure to also emit a
.symtab_shndx value to keep the arrays of equal size.

Note: since we're writing an UNDEF symbol, most GElf_Sym fields will
be 0 and we can repurpose one (st_size) to host the 0 for the xshndx
value.

Fixes: 2f2f7e47f052 ("objtool: Add elf_create_undef_symbol()")
Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Suggested-by: Fangrui Song <maskray@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lkml.kernel.org/r/YL3q1qFO9QIRL/BA@hirez.programming.kicks-ass.net
---
 tools/objtool/elf.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 743c2e9d0f56..41bca1d13d8e 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -717,7 +717,7 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str)
 
 struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name)
 {
-	struct section *symtab;
+	struct section *symtab, *symtab_shndx;
 	struct symbol *sym;
 	Elf_Data *data;
 	Elf_Scn *s;
@@ -769,6 +769,29 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name)
 	symtab->len += data->d_size;
 	symtab->changed = true;
 
+	symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
+	if (symtab_shndx) {
+		s = elf_getscn(elf->elf, symtab_shndx->idx);
+		if (!s) {
+			WARN_ELF("elf_getscn");
+			return NULL;
+		}
+
+		data = elf_newdata(s);
+		if (!data) {
+			WARN_ELF("elf_newdata");
+			return NULL;
+		}
+
+		data->d_buf = &sym->sym.st_size; /* conveniently 0 */
+		data->d_size = sizeof(Elf32_Word);
+		data->d_align = 4;
+		data->d_type = ELF_T_WORD;
+
+		symtab_shndx->len += 4;
+		symtab_shndx->changed = true;
+	}
+
 	sym->sec = find_section_by_index(elf, 0);
 
 	elf_add_symbol(elf, sym);
-- 
cgit v1.2.3


From 95bf69b400f41fbba7a2dc49b0152dd7bdc9a508 Mon Sep 17 00:00:00 2001
From: Yanan Wang <wangyanan55@huawei.com>
Date: Thu, 10 Jun 2021 16:54:18 +0800
Subject: KVM: selftests: Fix compiling errors when initializing the static
 structure

Errors like below were produced from test_util.c when compiling the KVM
selftests on my local platform.

lib/test_util.c: In function 'vm_mem_backing_src_alias':
lib/test_util.c:177:12: error: initializer element is not constant
    .flag = anon_flags,
            ^~~~~~~~~~
lib/test_util.c:177:12: note: (near initialization for 'aliases[0].flag')

The reason is that we are using non-const expressions to initialize the
static structure, which will probably trigger a compiling error/warning
on stricter GCC versions. Fix it by converting the two const variables
"anon_flags" and "anon_huge_flags" into more stable macros.

Fixes: b3784bc28ccc0 ("KVM: selftests: refactor vm_mem_backing_src_type flags")
Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Yanan Wang <wangyanan55@huawei.com>
Message-Id: <20210610085418.35544-1-wangyanan55@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/test_util.c | 38 ++++++++++++++---------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 6ad6c8276b2e..af1031fed97f 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -166,75 +166,75 @@ size_t get_def_hugetlb_pagesz(void)
 	return 0;
 }
 
+#define ANON_FLAGS	(MAP_PRIVATE | MAP_ANONYMOUS)
+#define ANON_HUGE_FLAGS	(ANON_FLAGS | MAP_HUGETLB)
+
 const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
 {
-	static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-	static const int anon_huge_flags = anon_flags | MAP_HUGETLB;
-
 	static const struct vm_mem_backing_src_alias aliases[] = {
 		[VM_MEM_SRC_ANONYMOUS] = {
 			.name = "anonymous",
-			.flag = anon_flags,
+			.flag = ANON_FLAGS,
 		},
 		[VM_MEM_SRC_ANONYMOUS_THP] = {
 			.name = "anonymous_thp",
-			.flag = anon_flags,
+			.flag = ANON_FLAGS,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
 			.name = "anonymous_hugetlb",
-			.flag = anon_huge_flags,
+			.flag = ANON_HUGE_FLAGS,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
 			.name = "anonymous_hugetlb_16kb",
-			.flag = anon_huge_flags | MAP_HUGE_16KB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_16KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
 			.name = "anonymous_hugetlb_64kb",
-			.flag = anon_huge_flags | MAP_HUGE_64KB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_64KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
 			.name = "anonymous_hugetlb_512kb",
-			.flag = anon_huge_flags | MAP_HUGE_512KB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_512KB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
 			.name = "anonymous_hugetlb_1mb",
-			.flag = anon_huge_flags | MAP_HUGE_1MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_1MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
 			.name = "anonymous_hugetlb_2mb",
-			.flag = anon_huge_flags | MAP_HUGE_2MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_2MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
 			.name = "anonymous_hugetlb_8mb",
-			.flag = anon_huge_flags | MAP_HUGE_8MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_8MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
 			.name = "anonymous_hugetlb_16mb",
-			.flag = anon_huge_flags | MAP_HUGE_16MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_16MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
 			.name = "anonymous_hugetlb_32mb",
-			.flag = anon_huge_flags | MAP_HUGE_32MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_32MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
 			.name = "anonymous_hugetlb_256mb",
-			.flag = anon_huge_flags | MAP_HUGE_256MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_256MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
 			.name = "anonymous_hugetlb_512mb",
-			.flag = anon_huge_flags | MAP_HUGE_512MB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_512MB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
 			.name = "anonymous_hugetlb_1gb",
-			.flag = anon_huge_flags | MAP_HUGE_1GB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_1GB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
 			.name = "anonymous_hugetlb_2gb",
-			.flag = anon_huge_flags | MAP_HUGE_2GB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_2GB,
 		},
 		[VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
 			.name = "anonymous_hugetlb_16gb",
-			.flag = anon_huge_flags | MAP_HUGE_16GB,
+			.flag = ANON_HUGE_FLAGS | MAP_HUGE_16GB,
 		},
 		[VM_MEM_SRC_SHMEM] = {
 			.name = "shmem",
-- 
cgit v1.2.3


From 2395da0e17935ce9158cdfae433962bdb6cbfa67 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 10 Jun 2021 15:59:43 -0700
Subject: selftests: mptcp: enable syncookie only in absence of reorders

Syncookie validation may fail for OoO packets, causing spurious
resets and self-tests failures, so let's force syncookie only
for tests iteration with no OoO.

Fixes: fed61c4b584c ("selftests: mptcp: make 2nd net namespace use tcp syn cookies unconditionally")
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/198
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 9ca5f1ba461e..2b495dc8d78e 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -197,9 +197,6 @@ ip -net "$ns4" link set ns4eth3 up
 ip -net "$ns4" route add default via 10.0.3.2
 ip -net "$ns4" route add default via dead:beef:3::2
 
-# use TCP syn cookies, even if no flooding was detected.
-ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
-
 set_ethtool_flags() {
 	local ns="$1"
 	local dev="$2"
@@ -737,6 +734,14 @@ for sender in $ns1 $ns2 $ns3 $ns4;do
 		exit $ret
 	fi
 
+	# ns1<->ns2 is not subject to reordering/tc delays. Use it to test
+	# mptcp syncookie support.
+	if [ $sender = $ns1 ]; then
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
+	else
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1
+	fi
+
 	run_tests "$ns2" $sender 10.0.1.2
 	run_tests "$ns2" $sender dead:beef:1::2
 	run_tests "$ns2" $sender 10.0.2.1
-- 
cgit v1.2.3


From 2d49b721dc18c113d5221f4cf5a6104eb66cb7f2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Jun 2021 09:04:29 +0200
Subject: objtool: Only rewrite unconditional retpoline thunk calls

It turns out that the compilers generate conditional branches to the
retpoline thunks like:

  5d5:   0f 85 00 00 00 00       jne    5db <cpuidle_reflect+0x22>
	5d7: R_X86_64_PLT32     __x86_indirect_thunk_r11-0x4

while the rewrite can only handle JMP/CALL to the thunks. The result
is the alternative wrecking the code. Make sure to skip writing the
alternatives for conditional branches.

Fixes: 9bc0bb50727c ("objtool/x86: Rewrite retpoline thunk calls")
Reported-by: Lukasz Majczak <lma@semihalf.com>
Reported-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
---
 tools/objtool/arch/x86/decode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 24295d39713b..523aa4157f80 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -747,6 +747,10 @@ int arch_rewrite_retpolines(struct objtool_file *file)
 
 	list_for_each_entry(insn, &file->retpoline_call_list, call_node) {
 
+		if (insn->type != INSN_JUMP_DYNAMIC &&
+		    insn->type != INSN_CALL_DYNAMIC)
+			continue;
+
 		if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk"))
 			continue;
 
-- 
cgit v1.2.3


From 197eecb6ecae0b04bd694432f640ff75597fed9c Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Sat, 5 Jun 2021 13:29:57 +0800
Subject: perf session: Correct buffer copying when peeking events

When peeking an event, it has a short path and a long path.  The short
path uses the session pointer "one_mmap_addr" to directly fetch the
event; and the long path needs to read out the event header and the
following event data from file and fill into the buffer pointer passed
through the argument "buf".

The issue is in the long path that it copies the event header and event
data into the same destination address which pointer "buf", this means
the event header is overwritten.  We are just lucky to run into the
short path in most cases, so we don't hit the issue in the long path.

This patch adds the offset "hdr_sz" to the pointer "buf" when copying
the event data, so that it can reserve the event header which can be
used properly by its caller.

Fixes: 5a52f33adf02 ("perf session: Add perf_session__peek_event()")
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Jiri Olsa <jolsa@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20210605052957.1070720-1-leo.yan@linaro.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 106b3d60881a..e59242c361ce 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1723,6 +1723,7 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
 	if (event->header.size < hdr_sz || event->header.size > buf_sz)
 		return -1;
 
+	buf += hdr_sz;
 	rest = event->header.size - hdr_sz;
 
 	if (readn(fd, buf, rest) != (ssize_t)rest)
-- 
cgit v1.2.3


From 36524112aba3246d1240c1791c72b26fa54008a3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 8 Jun 2021 13:46:18 -0300
Subject: tools headers cpufeatures: Sync with the kernel sources

To pick the changes in:

  fb35d30fe5b06cc2 ("x86/cpufeatures: Assign dedicated feature word for CPUID_0x8000001F[EAX]")
  e7b6385b01d8e9fb ("x86/cpufeatures: Add Intel SGX hardware bits")
  1478b99a76534b6c ("x86/cpufeatures: Mark ENQCMD as disabled when configured out")

That don't cause any change in the tools, just silences this perf build
warning:

  Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h'
  diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h

Cc: Borislav Petkov <bp@suse.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/disabled-features.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index b7dd944dc867..8f28fafa98b3 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -56,11 +56,8 @@
 # define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
 #endif
 
-#ifdef CONFIG_IOMMU_SUPPORT
-# define DISABLE_ENQCMD	0
-#else
-# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
-#endif
+/* Force disable because it's broken beyond repair */
+#define DISABLE_ENQCMD		(1 << (X86_FEATURE_ENQCMD & 31))
 
 #ifdef CONFIG_X86_SGX
 # define DISABLE_SGX	0
-- 
cgit v1.2.3


From b87b04f5019e821c8c6c7761f258402e43500a1f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@kernel.org>
Date: Sat, 12 Jun 2021 18:24:59 -0600
Subject: ipv4: Fix device used for dst_alloc with local routes

Oliver reported a use case where deleting a VRF device can hang
waiting for the refcnt to drop to 0. The root cause is that the dst
is allocated against the VRF device but cached on the loopback
device.

The use case (added to the selftests) has an implicit VRF crossing
due to the ordering of the FIB rules (lookup local is before the
l3mdev rule, but the problem occurs even if the FIB rules are
re-ordered with local after l3mdev because the VRF table does not
have a default route to terminate the lookup). The end result is
is that the FIB lookup returns the loopback device as the nexthop,
but the ingress device is in a VRF. The mismatch causes the dst
alloc against the VRF device but then cached on the loopback.

The fix is to bring the trick used for IPv6 (see ip6_rt_get_dev_rcu):
pick the dst alloc device based the fib lookup result but with checks
that the result has a nexthop device (e.g., not an unreachable or
prohibit entry).

Fixes: f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant")
Reported-by: Oliver Herms <oliver.peter.herms@gmail.com>
Signed-off-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c                         | 15 ++++++++++++++-
 tools/testing/selftests/net/fib_tests.sh | 25 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6787c55f6ab..6a36ac98476f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2056,6 +2056,19 @@ martian_source:
 	return err;
 }
 
+/* get device for dst_alloc with local routes */
+static struct net_device *ip_rt_get_dev(struct net *net,
+					const struct fib_result *res)
+{
+	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
+	struct net_device *dev = NULL;
+
+	if (nhc)
+		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
+
+	return dev ? : net->loopback_dev;
+}
+
 /*
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet
@@ -2212,7 +2225,7 @@ local_input:
 		}
 	}
 
-	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
 			   flags | RTCF_LOCAL, res->type,
 			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
 	if (!rth)
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 76d9487fb03c..5abe92d55b69 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -1384,12 +1384,37 @@ ipv4_rt_replace()
 	ipv4_rt_replace_mpath
 }
 
+# checks that cached input route on VRF port is deleted
+# when VRF is deleted
+ipv4_local_rt_cache()
+{
+	run_cmd "ip addr add 10.0.0.1/32 dev lo"
+	run_cmd "ip netns add test-ns"
+	run_cmd "ip link add veth-outside type veth peer name veth-inside"
+	run_cmd "ip link add vrf-100 type vrf table 1100"
+	run_cmd "ip link set veth-outside master vrf-100"
+	run_cmd "ip link set veth-inside netns test-ns"
+	run_cmd "ip link set veth-outside up"
+	run_cmd "ip link set vrf-100 up"
+	run_cmd "ip route add 10.1.1.1/32 dev veth-outside table 1100"
+	run_cmd "ip netns exec test-ns ip link set veth-inside up"
+	run_cmd "ip netns exec test-ns ip addr add 10.1.1.1/32 dev veth-inside"
+	run_cmd "ip netns exec test-ns ip route add 10.0.0.1/32 dev veth-inside"
+	run_cmd "ip netns exec test-ns ip route add default via 10.0.0.1"
+	run_cmd "ip netns exec test-ns ping 10.0.0.1 -c 1 -i 1"
+	run_cmd "ip link delete vrf-100"
+
+	# if we do not hang test is a success
+	log_test $? 0 "Cached route removed from VRF port device"
+}
+
 ipv4_route_test()
 {
 	route_setup
 
 	ipv4_rt_add
 	ipv4_rt_replace
+	ipv4_local_rt_cache
 
 	route_cleanup
 }
-- 
cgit v1.2.3


From 973377ffe8148180b2651825b92ae91988141b05 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 31 May 2021 12:34:24 +0000
Subject: bpf, selftests: Adjust few selftest outcomes wrt unreachable code

In almost all cases from test_verifier that have been changed in here, we've
had an unreachable path with a load from a register which has an invalid
address on purpose. This was basically to make sure that we never walk this
path and to have the verifier complain if it would otherwise. Change it to
match on the right error for unprivileged given we now test these paths
under speculative execution.

There's one case where we match on exact # of insns_processed. Due to the
extra path, this will of course mismatch on unprivileged. Thus, restrict the
test->insn_processed check to privileged-only.

In one other case, we result in a 'pointer comparison prohibited' error. This
is similarly due to verifying an 'invalid' branch where we end up with a value
pointer on one side of the comparison.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c        |  2 +-
 tools/testing/selftests/bpf/verifier/and.c         |  2 ++
 tools/testing/selftests/bpf/verifier/bounds.c      | 14 ++++++++++++++
 tools/testing/selftests/bpf/verifier/dead_code.c   |  2 ++
 tools/testing/selftests/bpf/verifier/jmp32.c       | 22 ++++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/jset.c        | 10 ++++++----
 tools/testing/selftests/bpf/verifier/unpriv.c      |  2 ++
 .../selftests/bpf/verifier/value_ptr_arith.c       |  7 ++++---
 8 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 1512092e1e68..3a9e332c5e36 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1147,7 +1147,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		}
 	}
 
-	if (test->insn_processed) {
+	if (!unpriv && test->insn_processed) {
 		uint32_t insn_processed;
 		char *proc;
 
diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c
index ca8fdb1b3f01..7d7ebee5cc7a 100644
--- a/tools/testing/selftests/bpf/verifier/and.c
+++ b/tools/testing/selftests/bpf/verifier/and.c
@@ -61,6 +61,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R1 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 0
 },
diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c
index 8a1caf46ffbc..e061e8799ce2 100644
--- a/tools/testing/selftests/bpf/verifier/bounds.c
+++ b/tools/testing/selftests/bpf/verifier/bounds.c
@@ -508,6 +508,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT
 },
 {
@@ -528,6 +530,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT
 },
 {
@@ -569,6 +573,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 min value is outside of the allowed memory range",
+	.result_unpriv = REJECT,
 	.fixup_map_hash_8b = { 3 },
 	.result = ACCEPT,
 },
@@ -589,6 +595,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 min value is outside of the allowed memory range",
+	.result_unpriv = REJECT,
 	.fixup_map_hash_8b = { 3 },
 	.result = ACCEPT,
 },
@@ -609,6 +617,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 min value is outside of the allowed memory range",
+	.result_unpriv = REJECT,
 	.fixup_map_hash_8b = { 3 },
 	.result = ACCEPT,
 },
@@ -674,6 +684,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 min value is outside of the allowed memory range",
+	.result_unpriv = REJECT,
 	.fixup_map_hash_8b = { 3 },
 	.result = ACCEPT,
 },
@@ -695,6 +707,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 min value is outside of the allowed memory range",
+	.result_unpriv = REJECT,
 	.fixup_map_hash_8b = { 3 },
 	.result = ACCEPT,
 },
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 17fe33a75034..2c8935b3e65d 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -8,6 +8,8 @@
 	BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
 },
diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c
index bd5cae4a7f73..1c857b2fbdf0 100644
--- a/tools/testing/selftests/bpf/verifier/jmp32.c
+++ b/tools/testing/selftests/bpf/verifier/jmp32.c
@@ -87,6 +87,8 @@
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -150,6 +152,8 @@
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -213,6 +217,8 @@
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -280,6 +286,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -348,6 +356,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -416,6 +426,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -484,6 +496,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -552,6 +566,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -620,6 +636,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -688,6 +706,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
@@ -756,6 +776,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 },
diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c
index 8dcd4e0383d5..11fc68da735e 100644
--- a/tools/testing/selftests/bpf/verifier/jset.c
+++ b/tools/testing/selftests/bpf/verifier/jset.c
@@ -82,8 +82,8 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.retval_unpriv = 1,
-	.result_unpriv = ACCEPT,
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.retval = 1,
 	.result = ACCEPT,
 },
@@ -141,7 +141,8 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.result_unpriv = ACCEPT,
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -162,6 +163,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.result_unpriv = ACCEPT,
+	.errstr_unpriv = "R9 !read_ok",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c
index bd436df5cc32..111801aea5e3 100644
--- a/tools/testing/selftests/bpf/verifier/unpriv.c
+++ b/tools/testing/selftests/bpf/verifier/unpriv.c
@@ -420,6 +420,8 @@
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
 	BPF_EXIT_INSN(),
 	},
+	.errstr_unpriv = "R7 invalid mem access 'inv'",
+	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 0,
 },
diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
index 7ae2859d495c..a3e593ddfafc 100644
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -120,7 +120,7 @@
 	.fixup_map_array_48b = { 1 },
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "R2 tried to add from different maps, paths or scalars",
+	.errstr_unpriv = "R2 pointer comparison prohibited",
 	.retval = 0,
 },
 {
@@ -159,7 +159,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	// fake-dead code; targeted from branch A to
-	// prevent dead code sanitization
+	// prevent dead code sanitization, rejected
+	// via branch B however
 	BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
@@ -167,7 +168,7 @@
 	.fixup_map_array_48b = { 1 },
 	.result = ACCEPT,
 	.result_unpriv = REJECT,
-	.errstr_unpriv = "R2 tried to add from different maps, paths or scalars",
+	.errstr_unpriv = "R0 invalid mem access 'inv'",
 	.retval = 0,
 },
 {
-- 
cgit v1.2.3


From 0fd158b89b50b3a31c97a639ff496e1c59686e97 Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Wed, 16 Jun 2021 16:03:21 +0200
Subject: selftests: net: veth: make test compatible with dash

veth.sh is a shell script that uses /bin/sh; some distro (Ubuntu for
example) use dash as /bin/sh and in this case the test reports the
following error:

 # ./veth.sh: 21: local: -r: bad variable name
 # ./veth.sh: 21: local: -r: bad variable name

This happens because dash doesn't support the option "-r" with local.

Moreover, in case of missing bpf object, the script is exiting -1, that
is an illegal number for dash:

 exit: Illegal number: -1

Change the script to be compatible both with bash and dash and prevent
the errors above.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/veth.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
index 2fedc0781ce8..11d7cdb898c0 100755
--- a/tools/testing/selftests/net/veth.sh
+++ b/tools/testing/selftests/net/veth.sh
@@ -18,7 +18,8 @@ ret=0
 
 cleanup() {
 	local ns
-	local -r jobs="$(jobs -p)"
+	local jobs
+	readonly jobs="$(jobs -p)"
 	[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
 	rm -f $STATS
 
@@ -108,7 +109,7 @@ chk_gro() {
 
 if [ ! -f ../bpf/xdp_dummy.o ]; then
 	echo "Missing xdp_dummy helper. Build bpf selftest first"
-	exit -1
+	exit 1
 fi
 
 create_ns
-- 
cgit v1.2.3


From 1b29df0e2e802cb15a5196c936f494161ec97502 Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Wed, 16 Jun 2021 16:57:27 +0200
Subject: selftests: net: use bash to run udpgro_fwd test case

udpgro_fwd.sh contains many bash specific operators ("[[", "local -r"),
but it's using /bin/sh; in some distro /bin/sh is mapped to /bin/dash,
that doesn't support such operators.

Force the test to use /bin/bash explicitly and prevent false positive
test failures.

Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgro_fwd.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index a8fa64136282..7f26591f236b 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
 readonly BASE="ns-$(mktemp -u XXXXXX)"
-- 
cgit v1.2.3


From d8ac05ea13d789d5491a5920d70a05659015441d Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Tue, 15 Jun 2021 16:04:43 +0100
Subject: KVM: selftests: Fix kvm_check_cap() assertion

KVM_CHECK_EXTENSION ioctl can return any negative value on error,
and not necessarily -1. Change the assertion to reflect that.

Signed-off-by: Fuad Tabba <tabba@google.com>
Message-Id: <20210615150443.1183365-1-tabba@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 5c70596dd1b9..a2b732cf96ea 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -82,7 +82,7 @@ int kvm_check_cap(long cap)
 
 	kvm_fd = open_kvm_dev_path_or_exit();
 	ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
-	TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
+	TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n"
 		"  rc: %i errno: %i", ret, errno);
 
 	close(kvm_fd);
-- 
cgit v1.2.3


From 7e9838b7915e29ae0dfe4a3e5f007c9dc6ab9b45 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Fri, 18 Jun 2021 13:04:36 +0200
Subject: selftests/net: Add icmp.sh for testing ICMP dummy address responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new icmp.sh selftest for testing that the kernel will respond
correctly with an ICMP unreachable message with the dummy (192.0.0.8)
source address when there are no IPv4 addresses configured to use as source
addresses.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/icmp.sh | 74 +++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100755 tools/testing/selftests/net/icmp.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/icmp.sh b/tools/testing/selftests/net/icmp.sh
new file mode 100755
index 000000000000..e4b04cd1644a
--- /dev/null
+++ b/tools/testing/selftests/net/icmp.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for checking ICMP response with dummy address instead of 0.0.0.0.
+# Sets up two namespaces like:
+# +----------------------+                          +--------------------+
+# | ns1                  |    v4-via-v6 routes:     | ns2                |
+# |                      |                  '       |                    |
+# |             +--------+   -> 172.16.1.0/24 ->    +--------+           |
+# |             | veth0  +--------------------------+  veth0 |           |
+# |             +--------+   <- 172.16.0.0/24 <-    +--------+           |
+# |           172.16.0.1 |                          | 2001:db8:1::2/64   |
+# |     2001:db8:1::2/64 |                          |                    |
+# +----------------------+                          +--------------------+
+#
+# And then tries to ping 172.16.1.1 from ns1. This results in a "net
+# unreachable" message being sent from ns2, but there is no IPv4 address set in
+# that address space, so the kernel should substitute the dummy address
+# 192.0.0.8 defined in RFC7600.
+
+NS1=ns1
+NS2=ns2
+H1_IP=172.16.0.1/32
+H1_IP6=2001:db8:1::1
+RT1=172.16.1.0/24
+PINGADDR=172.16.1.1
+RT2=172.16.0.0/24
+H2_IP6=2001:db8:1::2
+
+TMPFILE=$(mktemp)
+
+cleanup()
+{
+    rm -f "$TMPFILE"
+    ip netns del $NS1
+    ip netns del $NS2
+}
+
+trap cleanup EXIT
+
+# Namespaces
+ip netns add $NS1
+ip netns add $NS2
+
+# Connectivity
+ip -netns $NS1 link add veth0 type veth peer name veth0 netns $NS2
+ip -netns $NS1 link set dev veth0 up
+ip -netns $NS2 link set dev veth0 up
+ip -netns $NS1 addr add $H1_IP dev veth0
+ip -netns $NS1 addr add $H1_IP6/64 dev veth0 nodad
+ip -netns $NS2 addr add $H2_IP6/64 dev veth0 nodad
+ip -netns $NS1 route add $RT1 via inet6 $H2_IP6
+ip -netns $NS2 route add $RT2 via inet6 $H1_IP6
+
+# Make sure ns2 will respond with ICMP unreachable
+ip netns exec $NS2 sysctl -qw net.ipv4.icmp_ratelimit=0 net.ipv4.ip_forward=1
+
+# Run the test - a ping runs in the background, and we capture ICMP responses
+# with tcpdump; -c 1 means it should exit on the first ping, but add a timeout
+# in case something goes wrong
+ip netns exec $NS1 ping -w 3 -i 0.5 $PINGADDR >/dev/null &
+ip netns exec $NS1 timeout 10 tcpdump -tpni veth0 -c 1 'icmp and icmp[icmptype] != icmp-echo' > $TMPFILE 2>/dev/null
+
+# Parse response and check for dummy address
+# tcpdump output looks like:
+# IP 192.0.0.8 > 172.16.0.1: ICMP net 172.16.1.1 unreachable, length 92
+RESP_IP=$(awk '{print $2}' < $TMPFILE)
+if [[ "$RESP_IP" != "192.0.0.8" ]]; then
+    echo "FAIL - got ICMP response from $RESP_IP, should be 192.0.0.8"
+    exit 1
+else
+    echo "OK"
+    exit 0
+fi
-- 
cgit v1.2.3