summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJason A. Donenfeld <Jason@zx2c4.com>2022-12-08 17:55:04 +0100
committerJason A. Donenfeld <Jason@zx2c4.com>2024-07-19 20:22:12 +0200
commit9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 (patch)
tree147361bfcd52485ac62867a06d223712d4e79663 /tools
parent8a18fda0febb7790de20ec1c3b4522ce026be1c6 (diff)
downloadlwn-9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1.tar.gz
lwn-9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1.zip
mm: add MAP_DROPPABLE for designating always lazily freeable mappings
The vDSO getrandom() implementation works with a buffer allocated with a new system call that has certain requirements: - It shouldn't be written to core dumps. * Easy: VM_DONTDUMP. - It should be zeroed on fork. * Easy: VM_WIPEONFORK. - It shouldn't be written to swap. * Uh-oh: mlock is rlimited. * Uh-oh: mlock isn't inherited by forks. - It shouldn't reserve actual memory, but it also shouldn't crash when page faulting in memory if none is available * Uh-oh: VM_NORESERVE means segfaults. It turns out that the vDSO getrandom() function has three really nice characteristics that we can exploit to solve this problem: 1) Due to being wiped during fork(), the vDSO code is already robust to having the contents of the pages it reads zeroed out midway through the function's execution. 2) In the absolute worst case of whatever contingency we're coding for, we have the option to fallback to the getrandom() syscall, and everything is fine. 3) The buffers the function uses are only ever useful for a maximum of 60 seconds -- a sort of cache, rather than a long term allocation. These characteristics mean that we can introduce VM_DROPPABLE, which has the following semantics: a) It never is written out to swap. b) Under memory pressure, mm can just drop the pages (so that they're zero when read back again). c) It is inherited by fork. d) It doesn't count against the mlock budget, since nothing is locked. e) If there's not enough memory to service a page fault, it's not fatal, and no signal is sent. This way, allocations used by vDSO getrandom() can use: VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE And there will be no problem with OOMing, crashing on overcommitment, using memory when not in use, not wiping on fork(), coredumps, or writing out to swap. In order to let vDSO getrandom() use this, expose these via mmap(2) as MAP_DROPPABLE. Note that this involves removing the MADV_FREE special case from sort_folio(), which according to Yu Zhao is unnecessary and will simply result in an extra call to shrink_folio_list() in the worst case. The chunk removed reenables the swapbacked flag, which we don't want for VM_DROPPABLE, and we can't conditionalize it here because there isn't a vma reference available. Finally, the provided self test ensures that this is working as desired. Cc: linux-mm@kvack.org Acked-by: David Hildenbrand <david@redhat.com> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'tools')
-rw-r--r--tools/include/uapi/linux/mman.h1
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile1
-rw-r--r--tools/testing/selftests/mm/droppable.c53
4 files changed, 56 insertions, 0 deletions
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index a246e11988d5..e89d00528f2f 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
+#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 0b9ab987601c..a8beeb43c2b5 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -49,3 +49,4 @@ hugetlb_fault_after_madv
hugetlb_madv_vs_map
mseal_test
seal_elf
+droppable
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 3b49bc3d0a3b..e3e5740e13e1 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests
TEST_GEN_FILES += mdwe_test
TEST_GEN_FILES += hugetlb_fault_after_madv
TEST_GEN_FILES += hugetlb_madv_vs_map
+TEST_GEN_FILES += droppable
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c
new file mode 100644
index 000000000000..f3d9ecf96890
--- /dev/null
+++ b/tools/testing/selftests/mm/droppable.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+
+#include "../kselftest.h"
+
+int main(int argc, char *argv[])
+{
+ size_t alloc_size = 134217728;
+ size_t page_size = getpagesize();
+ void *alloc;
+ pid_t child;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+ assert(alloc != MAP_FAILED);
+ memset(alloc, 'A', alloc_size);
+ for (size_t i = 0; i < alloc_size; i += page_size)
+ assert(*(uint8_t *)(alloc + i));
+
+ child = fork();
+ assert(child >= 0);
+ if (!child) {
+ for (;;)
+ *(char *)malloc(page_size) = 'B';
+ }
+
+ for (bool done = false; !done;) {
+ for (size_t i = 0; i < alloc_size; i += page_size) {
+ if (!*(uint8_t *)(alloc + i)) {
+ done = true;
+ break;
+ }
+ }
+ }
+ kill(child, SIGTERM);
+
+ ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
+ exit(KSFT_PASS);
+}