diff options
Diffstat (limited to 'tools/testing/selftests/mm/userfaultfd.c')
-rw-r--r-- | tools/testing/selftests/mm/userfaultfd.c | 1858 |
1 files changed, 0 insertions, 1858 deletions
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c deleted file mode 100644 index 7f22844ed704..000000000000 --- a/tools/testing/selftests/mm/userfaultfd.c +++ /dev/null @@ -1,1858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stress userfaultfd syscall. - * - * Copyright (C) 2015 Red Hat, Inc. - * - * This test allocates two virtual areas and bounces the physical - * memory across the two virtual areas (from area_src to area_dst) - * using userfaultfd. - * - * There are three threads running per CPU: - * - * 1) one per-CPU thread takes a per-page pthread_mutex in a random - * page of the area_dst (while the physical page may still be in - * area_src), and increments a per-page counter in the same page, - * and checks its value against a verification region. - * - * 2) another per-CPU thread handles the userfaults generated by - * thread 1 above. userfaultfd blocking reads or poll() modes are - * exercised interleaved. - * - * 3) one last per-CPU thread transfers the memory in the background - * at maximum bandwidth (if not already transferred by thread - * 2). Each cpu thread takes cares of transferring a portion of the - * area. - * - * When all threads of type 3 completed the transfer, one bounce is - * complete. area_src and area_dst are then swapped. All threads are - * respawned and so the bounce is immediately restarted in the - * opposite direction. - * - * per-CPU threads 1 by triggering userfaults inside - * pthread_mutex_lock will also verify the atomicity of the memory - * transfer (UFFDIO_COPY). - */ - -#define _GNU_SOURCE -#include <stdio.h> -#include <errno.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <time.h> -#include <signal.h> -#include <poll.h> -#include <string.h> -#include <linux/mman.h> -#include <sys/mman.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/wait.h> -#include <pthread.h> -#include <linux/userfaultfd.h> -#include <setjmp.h> -#include <stdbool.h> -#include <assert.h> -#include <inttypes.h> -#include <stdint.h> -#include <sys/random.h> - -#include "../kselftest.h" -#include "vm_util.h" - -#ifdef __NR_userfaultfd - -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - -#define BOUNCE_RANDOM (1<<0) -#define BOUNCE_RACINGFAULTS (1<<1) -#define BOUNCE_VERIFY (1<<2) -#define BOUNCE_POLL (1<<3) -static int bounces; - -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - -/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ -#define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -static volatile bool test_uffdio_zeropage_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -static char *zeropage; -pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. - */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) - -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - -const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" - "# Run share memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " - "[hugetlbfs_file]\n\n"); - fprintf(stderr, "Supported <test type>: anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - - fprintf(stderr, "Examples:\n\n"); - fprintf(stderr, "%s", examples); - exit(1); -} - -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) - -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) -{ - int i; - - for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; - } -} - -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - -static inline uint64_t uffd_minor_feature(void) -{ - if (test_type == TEST_HUGETLB && map_shared) - return UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - return UFFD_FEATURE_MINOR_SHMEM; - else - return 0; -} - -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - close(fd); - return _uffd; -} - -static void userfaultfd_open(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - if (test_dev_userfaultfd) - uffd = __userfaultfd_open_dev(); - else { - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); - if (uffd < 0) - errexit(errno == ENOSYS ? KSFT_SKIP : 1, - "creating userfaultfd failed"); - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - err("UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability."); - if (uffdio_api.api != UFFD_API) - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); - - *features = uffdio_api.features; -} - -static inline void munmap_area(void **area) -{ - if (*area) - if (munmap(*area, nr_pages * page_size)) - err("munmap"); - - *area = NULL; -} - -static void uffd_test_ctx_clear(void) -{ - size_t i; - - if (pipefd) { - for (i = 0; i < nr_cpus * 2; ++i) { - if (close(pipefd[i])) - err("close pipefd"); - } - free(pipefd); - pipefd = NULL; - } - - if (count_verify) { - free(count_verify); - count_verify = NULL; - } - - if (uffd != -1) { - if (close(uffd)) - err("close uffd"); - uffd = -1; - } - - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); -} - -static void uffd_test_ctx_init(uint64_t features) -{ - unsigned long nr, cpu; - - uffd_test_ctx_clear(); - - uffd_test_ops->allocate_area((void **)&area_src, true); - uffd_test_ops->allocate_area((void **)&area_dst, false); - - userfaultfd_open(&features); - - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) - err("count_verify"); - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - /* - * After initialization of area_src, we must explicitly release pages - * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be errornously initialized with zero pages, - * hence we could hit memory corruption later in the test. - * - * One example is when THP is globally enabled, above allocate_area() - * calls could have the two areas merged into a single VMA (as they - * will have the same VMA flags so they're mergeable). When we - * initialize the area_src above, it's possible that some part of - * area_dst could have been faulted in via one huge THP that will be - * shared between area_src and area_dst. It could cause some of the - * area_dst won't be trapped by missing userfaults. - * - * This release_pages() will guarantee even if that happened, we'll - * proactively split the thp and drop any accidentally initialized - * pages within area_dst. - */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - -static void *locking_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; - unsigned long long count; - - if (!(bounces & BOUNCE_RANDOM)) { - page_nr = -bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; - } - - while (!finished) { - if (bounces & BOUNCE_RANDOM) { - if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) - err("getrandom failed"); - } else - page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) - err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); - count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - } - - return NULL; -} - -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - -static int copy_page_retry(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, true); -} - -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. - */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - -pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void *uffd_read_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - struct uffd_msg msg; - - pthread_mutex_unlock(&uffd_read_mutex); - /* from here cancellation is ok */ - - for (;;) { - if (uffd_read_msg(uffd, &msg)) - continue; - uffd_handle_page_fault(&msg, stats); - } - - return NULL; -} - -static void *background_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr, start_nr, mid_nr, end_nr; - - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; - mid_nr = (start_nr + end_nr) / 2; - - /* Copy the first half of the pages */ - for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - /* - * If we need to test uffd-wp, set it up now. Then we'll have - * at least the first half of the pages mapped already which - * can be write-protected for testing - */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, - nr_pages_per_cpu * page_size, true); - - /* - * Continue the 2nd half of the page copying, handling write - * protection faults if any - */ - for (page_nr = mid_nr; page_nr < end_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - return NULL; -} - -static int stress(struct uffd_stats *uffd_stats) -{ - unsigned long cpu; - pthread_t locking_threads[nr_cpus]; - pthread_t uffd_threads[nr_cpus]; - pthread_t background_threads[nr_cpus]; - - finished = 0; - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pthread_create(&locking_threads[cpu], &attr, - locking_thread, (void *)cpu)) - return 1; - if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_read_thread, - (void *)&uffd_stats[cpu])) - return 1; - pthread_mutex_lock(&uffd_read_mutex); - } - if (pthread_create(&background_threads[cpu], &attr, - background_thread, (void *)cpu)) - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(background_threads[cpu], NULL)) - return 1; - - /* - * Be strict and immediately zap area_src, the whole area has - * been transferred already by the background treads. The - * area_src could then be faulted in a racy way by still - * running uffdio_threads reading zeropages after we zapped - * area_src (but they're guaranteed to get -EEXIST from - * UFFDIO_COPY without writing zero pages into area_dst - * because the background threads already completed). - */ - uffd_test_ops->release_pages(area_src); - - finished = 1; - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pthread_join(locking_threads[cpu], NULL)) - return 1; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - char c; - if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) - err("pipefd write error"); - if (pthread_join(uffd_threads[cpu], - (void *)&uffd_stats[cpu])) - return 1; - } else { - if (pthread_cancel(uffd_threads[cpu])) - return 1; - if (pthread_join(uffd_threads[cpu], NULL)) - return 1; - } - } - - return 0; -} - -sigjmp_buf jbuf, *sigbuf; - -static void sighndl(int sig, siginfo_t *siginfo, void *ptr) -{ - if (sig == SIGBUS) { - if (sigbuf) - siglongjmp(*sigbuf, 1); - abort(); - } -} - -/* - * For non-cooperative userfaultfd test we fork() a process that will - * generate pagefaults, will mremap the area monitored by the - * userfaultfd and at last this process will release the monitored - * area. - * For the anonymous and shared memory the area is divided into two - * parts, the first part is accessed before mremap, and the second - * part is accessed after mremap. Since hugetlbfs does not support - * mremap, the entire monitored area is accessed in a single pass for - * HUGETLB_TEST. - * The release of the pages currently generates event for shmem and - * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked - * for hugetlb. - * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register - * monitored area, generate pagefaults and test that signal is delivered. - * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 - * test robustness use case - we release monitored area, fork a process - * that will generate pagefaults and verify signal is generated. - * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in ufdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - err("UFFDIO_ZEROPAGE unexpected size"); - } else { - if (test_uffdio_zeropage_eexist && retry) { - test_uffdio_zeropage_eexist = false; - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - } - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset, false); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - struct uffdio_register uffdio_register; - - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - -static int userfaultfd_events_test(void) -{ - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - return stats.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - struct uffdio_register uffdio_register; - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (faulting_process(1)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - char c; - struct uffd_stats stats = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - hpage_size); - /* - * This won't cause uffd-fault - it purely just makes sure there - * was no corruption. - */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; -} - -#define BIT_ULL(nr) (1ULL << (nr)) -#define PM_SOFT_DIRTY BIT_ULL(55) -#define PM_MMAP_EXCLUSIVE BIT_ULL(56) -#define PM_UFFD_WP BIT_ULL(57) -#define PM_FILE BIT_ULL(61) -#define PM_SWAP BIT_ULL(62) -#define PM_PRESENT BIT_ULL(63) - -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -static uint64_t pagemap_read_vaddr(int fd, void *vaddr) -{ - uint64_t value; - int ret; - - ret = pread(fd, &value, sizeof(uint64_t), - ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); - if (ret != sizeof(uint64_t)) - err("pread() on pagemap failed"); - - return value; -} - -/* This macro let __LINE__ works in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_read_vaddr(fd, area_dst); - /* - * After fork() uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -} - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - struct uffdio_register uffdio_register; - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(0); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(true)) - err("Detected stall uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* Uffd-wp should persist even swapped out */ - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure uffd-wp bit dropped when fork */ - if (pagemap_test_fork(false)) - err("Detected stall uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - -static int userfaultfd_stress(void) -{ - void *area; - unsigned long nr; - struct uffdio_register uffdio_register; - struct uffd_stats uffd_stats[nr_cpus]; - - uffd_test_ctx_init(0); - - if (posix_memalign(&area, page_size, page_size)) - err("out of memory"); - zeropage = area; - bzero(zeropage, page_size); - - pthread_mutex_lock(&uffd_read_mutex); - - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, 16*1024*1024); - - while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); - fflush(stdout); - - if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - else - fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); - - /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure alias"); - } - - /* - * The madvise done previously isn't enough: some - * uffd_thread could have read userfaults (one of - * those already resolved by the background thread) - * and it may be in the process of calling - * UFFDIO_COPY. UFFDIO_COPY will read the zapped - * area_src and it would map a zero page in it (of - * course such a UFFDIO_COPY is perfectly safe as it'd - * return -EEXIST). The problem comes at the next - * bounce though: that racing UFFDIO_COPY would - * generate zeropages in the area_src, so invalidating - * the previous MADV_DONTNEED. Without this additional - * MADV_DONTNEED those zeropages leftovers in the - * area_src would lead to -EEXIST failure during the - * next bounce, effectively leaving a zeropage in the - * area_dst. - * - * Try to comment this out madvise to see the memory - * corruption being caught pretty quick. - * - * khugepaged is also inhibited to collapse THP after - * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's - * required to MADV_DONTNEED here. - */ - uffd_test_ops->release_pages(area_dst); - - uffd_stats_reset(uffd_stats, nr_cpus); - - /* bounce pass */ - if (stress(uffd_stats)) - return 1; - - /* Clear all the write protections if there is any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); - - /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) - err("unregister failure"); - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) - err("unregister failure alias"); - } - - /* verification */ - if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) - err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); - - /* prepare next bounce */ - swap(area_src, area_dst); - - swap(area_src_alias, area_dst_alias); - - uffd_stats_report(uffd_stats, nr_cpus); - } - - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly when fork), meanwhile the pagemap test will verify - * pgtable entry of fork()ed child. - */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code for x86_64 for now for 2M THP, as x86_64 is - * currently the only one that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); -} - -/* - * Copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static void set_test_type(const char *type) -{ - if (!strcmp(type, "anon")) { - test_type = TEST_ANON; - uffd_test_ops = &anon_uffd_test_ops; - } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { - map_shared = true; - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here. */ - test_uffdio_minor = true; - } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; - uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; - } -} - -static void parse_test_type_arg(const char *raw_type) -{ - char *buf = strdup(raw_type); - uint64_t features = UFFD_API_FEATURES; - - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; - else - err("unrecognized test mod '%s'", token); - } - - if (!test_type) - err("failed to parse test type argument: '%s'", raw_type); - - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - - if (test_type == TEST_HUGETLB) - page_size = hpage_size; - else - page_size = sysconf(_SC_PAGE_SIZE); - - if (!page_size) - err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - err("Impossible to run this test"); - - /* - * Whether we can test certain features depends not just on test type, - * but also on whether or not this particular kernel supports the - * feature. - */ - - userfaultfd_open(&features); - - test_uffdio_wp = test_uffdio_wp && - (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); - - close(uffd); - uffd = -1; -} - -static void sigalrm(int sig) -{ - if (sig != SIGALRM) - abort(); - test_uffdio_copy_eexist = true; - test_uffdio_zeropage_eexist = true; - alarm(ALARM_INTERVAL_SECS); -} - -int main(int argc, char **argv) -{ - size_t bytes; - - if (argc < 4) - usage(); - - if (signal(SIGALRM, sigalrm) == SIG_ERR) - err("failed to arm SIGALRM"); - alarm(ALARM_INTERVAL_SECS); - - hpage_size = default_huge_page_size(); - parse_test_type_arg(argv[1]); - bytes = atol(argv[2]) * 1024 * 1024; - - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * paze_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (2 << (H-P)) - * where: - * bytes = hpage_size * N - * hpage_size = 2 << H - * page_size = 2 << P - * - * And we want to chose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, prime factorization of - * N and nr_cpus may be arbitrary, so have to search for it. - * Instead, just use the highest power of 2 dividing both - * nr_cpus and (bytes / page_size). - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ? x : y; - } - nr_pages_per_cpu = bytes / page_size / nr_cpus; - if (!nr_pages_per_cpu) { - _err("invalid MiB"); - usage(); - } - - bounces = atoi(argv[3]); - if (bounces <= 0) { - _err("invalid bounces"); - usage(); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ |