From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 30 May 2022 14:34:50 -0400 Subject: mm: avoid unnecessary page fault retires on shared memory types I observed that for each of the shared file-backed page faults, we're very likely to retry one more time for the 1st write fault upon no page. It's because we'll need to release the mmap lock for dirty rate limiting purposes with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()). Then after that throttling we return VM_FAULT_RETRY. We did that probably because VM_FAULT_RETRY is the only way we can return to the fault handler at that time telling it we've released the mmap lock. However that's not ideal because it's very likely the fault does not need to be retried at all since the pgtable was well installed before the throttling, so the next continuous fault (including taking mmap read lock, walking the pgtable, etc.) could be in most cases unnecessary. It's not only slowing down page faults for shared file-backed memory, but also adds more mmap lock contention which is in most cases not needed at all. To observe this, one could try to write to some shmem page and look at "pgfault" value in /proc/vmstat, then we should expect 2 counts for each shmem write simply because we retried, and vm event "pgfault" will capture that. To make it more efficient, add a new VM_FAULT_COMPLETED return code just to show that we've completed the whole fault and released the lock. It's also a hint that we should very possibly not need another fault immediately on this page because we've just completed it. This patch provides a ~12% perf boost on my aarch64 test VM with a simple program sequentially dirtying 400MB shmem file being mmap()ed and these are the times it needs: Before: 650.980 ms (+-1.94%) After: 569.396 ms (+-1.38%) I believe it could help more than that. 
We need some special care on GUP and the s390 pgfault handler (for gmap code before returning from pgfault); the rest of the changes in the page fault handlers should be relatively straightforward. Another thing to mention is that mm_account_fault() does take this new fault as a generic fault to be accounted, unlike VM_FAULT_RETRY. I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping them as-is. Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Vineet Gupta Acked-by: Guo Ren Acked-by: Max Filippov Acked-by: Christian Borntraeger Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Reviewed-by: Alistair Popple Reviewed-by: Ingo Molnar Acked-by: Russell King (Oracle) [arm part] Acked-by: Heiko Carstens Cc: Vasily Gorbik Cc: Stafford Horne Cc: David S. Miller Cc: Johannes Berg Cc: Brian Cain Cc: Richard Henderson Cc: Richard Weinberger Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Janosch Frank Cc: Albert Ou Cc: Anton Ivanov Cc: Dave Hansen Cc: Borislav Petkov Cc: Sven Schnelle Cc: Andrea Arcangeli Cc: James Bottomley Cc: Al Viro Cc: Alexander Gordeev Cc: Jonas Bonn Cc: Will Deacon Cc: Vlastimil Babka Cc: Michal Simek Cc: Matt Turner Cc: Paul Mackerras Cc: David Hildenbrand Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Stefan Kristiansson Cc: Paul Walmsley Cc: Ivan Kokshaysky Cc: Chris Zankel Cc: Hugh Dickins Cc: Dinh Nguyen Cc: Rich Felker Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Thomas Bogendoerfer Cc: Helge Deller Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c29ab4c0cd5c..6b961a29bf26 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) + * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ @@ -746,6 +747,7 @@ enum vm_fault_reason { VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, + VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; -- cgit v1.2.3 From bcc728eb4f446073e0160671d7d0059a4e9aa300 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 31 May 2022 10:04:21 +0800 Subject: mm/damon: remove obsolete comments of kdamond_stop Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") delete kdamond_stop and change to use kthread stop mechanism, these obsolete comments should be removed accordingly. Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c62da31ce4b..2765c7d99beb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -397,7 +397,6 @@ struct damon_callback { * detail. * * @kdamond: Kernel thread who does the monitoring. - * @kdamond_stop: Notifies whether kdamond should stop. 
* @kdamond_lock: Mutex for the synchronizations with @kdamond. * * For each monitoring context, one kernel thread for the monitoring is @@ -406,14 +405,14 @@ struct damon_callback { * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. The validity of the * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by writing non-zero to - * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. - * Therefore, users can know whether the monitoring is ongoing or terminated by - * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from - * outside of the monitoring thread must be protected by @kdamond_lock. - * - * Note that the monitoring thread protects only @kdamond and @kdamond_stop via - * @kdamond_lock. Accesses to other fields must be protected by themselves. + * termination can also be explicitly requested by calling damon_stop(). + * The thread sets @kdamond to NULL when it terminates. Therefore, users can + * know whether the monitoring is ongoing or terminated by reading @kdamond. + * Reads and writes to @kdamond from outside of the monitoring thread must + * be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond via @kdamond_lock. + * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. -- cgit v1.2.3 From 9384d79249d04b03572abb7e551a35d99c9268c0 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 6 Jun 2022 16:15:33 +0200 Subject: mm/highmem: delete memmove_page() Matthew Wilcox reported that, while he was looking at memmove_page(), he realized that it can't actually work. The reasons are hidden in its implementation, which makes use of memmove() on logical addresses provided by kmap_local_page(). 
memmove() does the wrong thing when it tests "if (dest <= src)". Therefore, delete memmove_page(). No need to change any other code because we have no call sites of memmove_page() across the whole kernel. Link: https://lkml.kernel.org/r/20220606141533.555-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reported-by: Matthew Wilcox Reviewed-by: Baoquan He Reviewed-by: Ira Weiny Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3af34de54330..fee9835e3793 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off, kunmap_local(dst); } -static inline void memmove_page(struct page *dst_page, size_t dst_off, - struct page *src_page, size_t src_off, - size_t len) -{ - char *dst = kmap_local_page(dst_page); - char *src = kmap_local_page(src_page); - - VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); - memmove(dst + dst_off, src + src_off, len); - kunmap_local(src); - kunmap_local(dst); -} - static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { -- cgit v1.2.3 From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:48 +0800 Subject: mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count argument to kmemleak_alloc_phys() Patch series "mm: kmemleak: store objects allocated with physical address separately and check when scan", v4. The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to check address. But on some architectures, kmemleak_*_phys() is called before those two variables initialized. 
The following steps will be taken: 1) Add OBJECT_PHYS flag and rbtree for the objects allocated with physical address 2) Store physical address in objects if allocated with OBJECT_PHYS 3) Check the boundary when scan instead of in kmemleak_*_phys() This patch set will solve: https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com This patch (of 4): Remove the unused kmemleak_not_leak_phys() function. And remove the min_count argument to kmemleak_alloc_phys() function, assume it's 0. Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmemleak.rst | 1 - drivers/of/fdt.c | 2 +- include/linux/kmemleak.h | 8 ++------ mm/kmemleak.c | 20 +++----------------- mm/memblock.c | 14 +++++++------- tools/testing/memblock/linux/kmemleak.h | 2 +- 6 files changed, 14 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 1c935f41cd3a..5483fd39ef29 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -174,7 +174,6 @@ mapping: - ``kmemleak_alloc_phys`` - ``kmemleak_free_part_phys`` -- ``kmemleak_not_leak_phys`` - ``kmemleak_ignore_phys`` Dealing with false positives/negatives diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index a8f5b6532165..2c677e84c3f5 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -529,7 +529,7 @@ static int __init 
__reserved_mem_reserve_reg(unsigned long node, pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); if (!nomap) - kmemleak_alloc_phys(base, size, 0, 0); + kmemleak_alloc_phys(base, size, 0); } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 34684b2026ab..6a3cd1bf4680 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; -extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) __ref; extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; -extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, @@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr) { } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) { } -static inline void kmemleak_not_leak_phys(phys_addr_t phys) -{ -} static inline void kmemleak_ignore_phys(phys_addr_t phys) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a182f5ddaf68..156eafafa182 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1125,15 +1125,13 @@ EXPORT_SYMBOL(kmemleak_no_scan); * address argument * @phys: physical address of the object * @size: size of the object - * @min_count: minimum number of references to this object. 
- * See kmemleak_alloc() * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ -void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, - gfp_t gfp) +void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_alloc(__va(phys), size, min_count, gfp); + /* assume min_count 0 */ + kmemleak_alloc(__va(phys), size, 0, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1151,18 +1149,6 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) } EXPORT_SYMBOL(kmemleak_free_part_phys); -/** - * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical - * address argument - * @phys: physical address of the object - */ -void __ref kmemleak_not_leak_phys(phys_addr_t phys) -{ - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_not_leak(__va(phys)); -} -EXPORT_SYMBOL(kmemleak_not_leak_phys); - /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument diff --git a/mm/memblock.c b/mm/memblock.c index e4f03a6e8e56..749abd2685c4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * from the regions with mirroring enabled and then retried from any * memory region. * - * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for - * allocated boot memory block, so that it is never reported as leaks. + * In addition, function using kmemleak_alloc_phys for allocated boot + * memory block, it is never reported as leaks. * * Return: * Physical address of allocated memory block on success, %0 on failure. @@ -1398,12 +1398,12 @@ done: */ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) /* - * The min_count is set to 0 so that memblock allocated - * blocks are never reported as leaks. 
This is because many - * of these blocks are only referred via the physical - * address which is not looked up by kmemleak. + * Memblock allocated blocks are never reported as + * leaks. This is because many of these blocks are + * only referred via the physical address which is + * not looked up by kmemleak. */ - kmemleak_alloc_phys(found, size, 0, 0); + kmemleak_alloc_phys(found, size, 0); return found; } diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h index 462f8c5e8aa0..5fed13bb9ec4 100644 --- a/tools/testing/memblock/linux/kmemleak.h +++ b/tools/testing/memblock/linux/kmemleak.h @@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } -- cgit v1.2.3 From fc4db90fe71e640e3fe88df346f7cf653b75315d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 10 Jun 2022 11:03:10 -0700 Subject: mm: kmem: make mem_cgroup_from_obj() vmalloc()-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently mem_cgroup_from_obj() is not working properly with objects allocated using vmalloc(). It creates problems in some cases, when it's called for static objects belonging to modules or generally allocated using vmalloc(). This patch makes mem_cgroup_from_obj() safe to be called on objects allocated using vmalloc(). It also introduces mem_cgroup_from_slab_obj(), which is a faster version to use in places when we know the object is either a slab object or a generic slab page (e.g. when adding an object to a lru list). 
Link: https://lkml.kernel.org/r/20220610180310.1725111-1-roman.gushchin@linux.dev Suggested-by: Kefeng Wang Signed-off-by: Roman Gushchin Tested-by: Linux Kernel Functional Testing Acked-by: Shakeel Butt Tested-by: Vasily Averin Acked-by: Michal Hocko Acked-by: Muchun Song Cc: Johannes Weiner Cc: Naresh Kamboju Cc: Qian Cai Cc: Kefeng Wang Cc: David S. Miller Cc: Eric Dumazet Cc: Florian Westphal Cc: Jakub Kicinski Cc: Michal Koutný Cc: Paolo Abeni Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++ mm/list_lru.c | 2 +- mm/memcontrol.c | 71 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 57 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ecead1042b9..3ce96ce5fe3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_obj(void *p); +struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) @@ -1801,6 +1802,11 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +{ + return NULL; +} + static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { diff --git a/mm/list_lru.c b/mm/list_lru.c index ba76428ceece..a05e5bef3b40 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, if (!list_lru_memcg_aware(lru)) goto out; - memcg = mem_cgroup_from_obj(ptr); + memcg = mem_cgroup_from_slab_obj(ptr); if (!memcg) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28c1532cc91f..c1ae9b3f8d35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -783,7 +783,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) 
struct lruvec *lruvec; rcu_read_lock(); - memcg = mem_cgroup_from_obj(p); + memcg = mem_cgroup_from_slab_obj(p); /* * Untracked pages have no memcg, no lruvec. Update only the @@ -2841,27 +2841,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, return 0; } -/* - * Returns a pointer to the memory cgroup to which the kernel object is charged. - * - * A passed kernel object can be a slab object or a generic kernel page, so - * different mechanisms for getting the memory cgroup pointer should be used. - * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller - * can not know for sure how the kernel object is implemented. - * mem_cgroup_from_obj() can be safely used in such cases. - * - * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), - * cgroup_mutex, etc. - */ -struct mem_cgroup *mem_cgroup_from_obj(void *p) +static __always_inline +struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { - struct folio *folio; - - if (mem_cgroup_disabled()) - return NULL; - - folio = virt_to_folio(p); - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2894,6 +2876,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return page_memcg_check(folio_page(folio, 0)); } +/* + * Returns a pointer to the memory cgroup to which the kernel object is charged. + * + * A passed kernel object can be a slab object, vmalloc object or a generic + * kernel page, so different mechanisms for getting the memory cgroup pointer + * should be used. + * + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * + * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), + * cgroup_mutex, etc. 
+ */ +struct mem_cgroup *mem_cgroup_from_obj(void *p) +{ + struct folio *folio; + + if (mem_cgroup_disabled()) + return NULL; + + if (unlikely(is_vmalloc_addr(p))) + folio = page_folio(vmalloc_to_page(p)); + else + folio = virt_to_folio(p); + + return mem_cgroup_from_obj_folio(folio, p); +} + +/* + * Returns a pointer to the memory cgroup to which the kernel object is charged. + * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, + * allocated using vmalloc(). + * + * A passed kernel object must be a slab object or a generic kernel page. + * + * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), + * cgroup_mutex, etc. + */ +struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +{ + if (mem_cgroup_disabled()) + return NULL; + + return mem_cgroup_from_obj_folio(virt_to_folio(p), p); +} + static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { struct obj_cgroup *objcg = NULL; -- cgit v1.2.3 From 1d0403d20f6c281cb3d14c5f1db5317caeec48e9 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 3 Jun 2022 07:19:43 +0300 Subject: net: set proper memcg for net_init hooks allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __register_pernet_operations() executes init hook of registered pernet_operation structure in all existing net namespaces. Typically, these hooks are called by a process associated with the specified net namespace, and all __GFP_ACCOUNT marked allocation are accounted for corresponding container/memcg. However __register_pernet_operations() calls the hooks in the same context, and as a result all marked allocations are accounted to one memcg for all processed net namespaces. This patch adjusts active memcg for each net namespace and helps to account memory allocated inside ops_init() into the proper memcg. 
Link: https://lkml.kernel.org/r/f9394752-e272-9bf9-645f-a18c56d1c4ec@openvz.org Signed-off-by: Vasily Averin Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Michal Koutný Cc: Vlastimil Babka Cc: Michal Hocko Cc: Florian Westphal Cc: David S. Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Eric Dumazet Cc: Johannes Weiner Cc: Kefeng Wang Cc: Linux Kernel Functional Testing Cc: Muchun Song Cc: Naresh Kamboju Cc: Qian Cai Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 47 +++++++++++++++++++++++++++++++++++++++++++++- net/core/net_namespace.c | 7 +++++++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3ce96ce5fe3e..04f2f33607e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1756,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_unlock(); } +/** + * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object. + * @p: pointer to object from which memcg should be extracted. It can be NULL. + * + * Retrieves the memory group into which the memory of the pointed kernel + * object is accounted. If memcg is found, its reference is taken. + * If a passed kernel object is uncharged, or if proper memcg cannot be found, + * as well as if mem_cgroup is disabled, NULL is returned. + * + * Return: valid memcg pointer with taken reference or NULL. + */ +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + do { + memcg = mem_cgroup_from_obj(p); + } while (memcg && !css_tryget(&memcg->css)); + rcu_read_unlock(); + return memcg; +} + +/** + * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup. + * @memcg: pointer to a valid memory cgroup or NULL. + * + * If passed argument is not NULL, returns it without any additional checks + * and changes. Otherwise, root_mem_cgroup is returned. 
+ * + * NOTE: root_mem_cgroup can be NULL during early boot. + */ +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return memcg ? memcg : root_mem_cgroup; +} #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1799,7 +1835,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { - return NULL; + return NULL; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) @@ -1812,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + return NULL; +} + +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0ec2f5906a27..6b9f19122ec1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1143,7 +1144,13 @@ static int __register_pernet_operations(struct list_head *list, * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { + struct mem_cgroup *old, *memcg; + + memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net)); + old = set_active_memcg(memcg); error = ops_init(ops, net); + set_active_memcg(old); + mem_cgroup_put(memcg); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); -- cgit v1.2.3 From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Jun 2022 09:00:26 +0300 Subject: docs: rename Documentation/vm to Documentation/mm so it will be consistent with code mm directory and with Documentation/admin-guide/mm and won't be confused with virtual machines. 
Signed-off-by: Mike Rapoport Suggested-by: Matthew Wilcox Tested-by: Ira Weiny Acked-by: Jonathan Corbet Acked-by: Wu XiangCheng --- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 +- Documentation/admin-guide/kernel-parameters.txt | 10 +- Documentation/admin-guide/mm/concepts.rst | 2 +- Documentation/admin-guide/mm/damon/index.rst | 2 +- Documentation/admin-guide/mm/damon/reclaim.rst | 2 +- Documentation/admin-guide/mm/damon/usage.rst | 8 +- Documentation/admin-guide/sysctl/vm.rst | 2 +- Documentation/core-api/index.rst | 2 +- Documentation/filesystems/proc.rst | 2 +- Documentation/index.rst | 2 +- Documentation/mm/active_mm.rst | 91 ++++ Documentation/mm/arch_pgtable_helpers.rst | 260 +++++++++ Documentation/mm/balance.rst | 102 ++++ Documentation/mm/bootmem.rst | 5 + Documentation/mm/damon/api.rst | 20 + Documentation/mm/damon/design.rst | 176 ++++++ Documentation/mm/damon/faq.rst | 50 ++ Documentation/mm/damon/index.rst | 29 + Documentation/mm/free_page_reporting.rst | 40 ++ Documentation/mm/frontswap.rst | 266 +++++++++ Documentation/mm/highmem.rst | 167 ++++++ Documentation/mm/hmm.rst | 452 ++++++++++++++++ Documentation/mm/hugetlbfs_reserv.rst | 596 +++++++++++++++++++++ Documentation/mm/hwpoison.rst | 184 +++++++ Documentation/mm/index.rst | 68 +++ Documentation/mm/ksm.rst | 87 +++ Documentation/mm/memory-model.rst | 177 ++++++ Documentation/mm/mmu_notifier.rst | 99 ++++ Documentation/mm/numa.rst | 150 ++++++ Documentation/mm/oom.rst | 5 + Documentation/mm/overcommit-accounting.rst | 88 +++ Documentation/mm/page_allocation.rst | 5 + Documentation/mm/page_cache.rst | 5 + Documentation/mm/page_frags.rst | 45 ++ Documentation/mm/page_migration.rst | 288 ++++++++++ Documentation/mm/page_owner.rst | 196 +++++++ Documentation/mm/page_reclaim.rst | 5 + Documentation/mm/page_table_check.rst | 56 ++ Documentation/mm/page_tables.rst | 5 + Documentation/mm/physical_memory.rst | 5 + Documentation/mm/process_addrs.rst 
| 5 + Documentation/mm/remap_file_pages.rst | 33 ++ Documentation/mm/shmfs.rst | 5 + Documentation/mm/slab.rst | 5 + Documentation/mm/slub.rst | 452 ++++++++++++++++ Documentation/mm/split_page_table_lock.rst | 100 ++++ Documentation/mm/swap.rst | 5 + Documentation/mm/transhuge.rst | 187 +++++++ Documentation/mm/unevictable-lru.rst | 554 +++++++++++++++++++ Documentation/mm/vmalloc.rst | 5 + Documentation/mm/vmalloced-kernel-stacks.rst | 153 ++++++ Documentation/mm/vmemmap_dedup.rst | 223 ++++++++ Documentation/mm/z3fold.rst | 30 ++ Documentation/mm/zsmalloc.rst | 82 +++ .../zh_CN/admin-guide/mm/damon/index.rst | 2 +- .../zh_CN/admin-guide/mm/damon/reclaim.rst | 2 +- .../zh_CN/admin-guide/mm/damon/usage.rst | 8 +- .../translations/zh_CN/core-api/index.rst | 2 +- Documentation/translations/zh_CN/index.rst | 2 +- Documentation/translations/zh_CN/mm/active_mm.rst | 85 +++ Documentation/translations/zh_CN/mm/balance.rst | 81 +++ Documentation/translations/zh_CN/mm/damon/api.rst | 32 ++ .../translations/zh_CN/mm/damon/design.rst | 140 +++++ Documentation/translations/zh_CN/mm/damon/faq.rst | 48 ++ .../translations/zh_CN/mm/damon/index.rst | 32 ++ .../translations/zh_CN/mm/free_page_reporting.rst | 38 ++ Documentation/translations/zh_CN/mm/frontswap.rst | 196 +++++++ Documentation/translations/zh_CN/mm/highmem.rst | 128 +++++ Documentation/translations/zh_CN/mm/hmm.rst | 361 +++++++++++++ .../translations/zh_CN/mm/hugetlbfs_reserv.rst | 436 +++++++++++++++ Documentation/translations/zh_CN/mm/hwpoison.rst | 166 ++++++ Documentation/translations/zh_CN/mm/index.rst | 54 ++ Documentation/translations/zh_CN/mm/ksm.rst | 70 +++ .../translations/zh_CN/mm/memory-model.rst | 135 +++++ .../translations/zh_CN/mm/mmu_notifier.rst | 97 ++++ Documentation/translations/zh_CN/mm/numa.rst | 101 ++++ .../zh_CN/mm/overcommit-accounting.rst | 86 +++ Documentation/translations/zh_CN/mm/page_frags.rst | 38 ++ Documentation/translations/zh_CN/mm/page_owner.rst | 116 ++++ 
.../translations/zh_CN/mm/page_table_check.rst | 56 ++ .../translations/zh_CN/mm/remap_file_pages.rst | 32 ++ .../zh_CN/mm/split_page_table_lock.rst | 96 ++++ Documentation/translations/zh_CN/mm/z3fold.rst | 31 ++ Documentation/translations/zh_CN/mm/zsmalloc.rst | 78 +++ Documentation/translations/zh_CN/vm/active_mm.rst | 85 --- Documentation/translations/zh_CN/vm/balance.rst | 81 --- Documentation/translations/zh_CN/vm/damon/api.rst | 32 -- .../translations/zh_CN/vm/damon/design.rst | 140 ----- Documentation/translations/zh_CN/vm/damon/faq.rst | 48 -- .../translations/zh_CN/vm/damon/index.rst | 33 -- .../translations/zh_CN/vm/free_page_reporting.rst | 38 -- Documentation/translations/zh_CN/vm/frontswap.rst | 196 ------- Documentation/translations/zh_CN/vm/highmem.rst | 128 ----- Documentation/translations/zh_CN/vm/hmm.rst | 361 ------------- .../translations/zh_CN/vm/hugetlbfs_reserv.rst | 436 --------------- Documentation/translations/zh_CN/vm/hwpoison.rst | 166 ------ Documentation/translations/zh_CN/vm/index.rst | 54 -- Documentation/translations/zh_CN/vm/ksm.rst | 70 --- .../translations/zh_CN/vm/memory-model.rst | 135 ----- .../translations/zh_CN/vm/mmu_notifier.rst | 97 ---- Documentation/translations/zh_CN/vm/numa.rst | 101 ---- .../zh_CN/vm/overcommit-accounting.rst | 86 --- Documentation/translations/zh_CN/vm/page_frags.rst | 38 -- Documentation/translations/zh_CN/vm/page_owner.rst | 116 ---- .../translations/zh_CN/vm/page_table_check.rst | 56 -- .../translations/zh_CN/vm/remap_file_pages.rst | 32 -- .../zh_CN/vm/split_page_table_lock.rst | 96 ---- Documentation/translations/zh_CN/vm/z3fold.rst | 31 -- Documentation/translations/zh_CN/vm/zsmalloc.rst | 78 --- Documentation/translations/zh_TW/index.rst | 2 +- Documentation/vm/.gitignore | 3 - Documentation/vm/active_mm.rst | 91 ---- Documentation/vm/arch_pgtable_helpers.rst | 260 --------- Documentation/vm/balance.rst | 102 ---- Documentation/vm/bootmem.rst | 5 - Documentation/vm/damon/api.rst | 20 - 
Documentation/vm/damon/design.rst | 176 ------ Documentation/vm/damon/faq.rst | 50 -- Documentation/vm/damon/index.rst | 29 - Documentation/vm/free_page_reporting.rst | 40 -- Documentation/vm/frontswap.rst | 266 --------- Documentation/vm/highmem.rst | 167 ------ Documentation/vm/hmm.rst | 452 ---------------- Documentation/vm/hugetlbfs_reserv.rst | 596 --------------------- Documentation/vm/hwpoison.rst | 184 ------- Documentation/vm/index.rst | 68 --- Documentation/vm/ksm.rst | 87 --- Documentation/vm/memory-model.rst | 177 ------ Documentation/vm/mmu_notifier.rst | 99 ---- Documentation/vm/numa.rst | 150 ------ Documentation/vm/oom.rst | 5 - Documentation/vm/overcommit-accounting.rst | 88 --- Documentation/vm/page_allocation.rst | 5 - Documentation/vm/page_cache.rst | 5 - Documentation/vm/page_frags.rst | 45 -- Documentation/vm/page_migration.rst | 288 ---------- Documentation/vm/page_owner.rst | 196 ------- Documentation/vm/page_reclaim.rst | 5 - Documentation/vm/page_table_check.rst | 56 -- Documentation/vm/page_tables.rst | 5 - Documentation/vm/physical_memory.rst | 5 - Documentation/vm/process_addrs.rst | 5 - Documentation/vm/remap_file_pages.rst | 33 -- Documentation/vm/shmfs.rst | 5 - Documentation/vm/slab.rst | 5 - Documentation/vm/slub.rst | 452 ---------------- Documentation/vm/split_page_table_lock.rst | 100 ---- Documentation/vm/swap.rst | 5 - Documentation/vm/transhuge.rst | 187 ------- Documentation/vm/unevictable-lru.rst | 554 ------------------- Documentation/vm/vmalloc.rst | 5 - Documentation/vm/vmalloced-kernel-stacks.rst | 153 ------ Documentation/vm/vmemmap_dedup.rst | 223 -------- Documentation/vm/z3fold.rst | 30 -- Documentation/vm/zsmalloc.rst | 82 --- MAINTAINERS | 12 +- arch/loongarch/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- include/linux/hmm.h | 4 +- include/linux/memremap.h | 2 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 +- include/linux/swap.h | 2 +- mm/Kconfig | 2 +- 
mm/debug_vm_pgtable.c | 2 +- mm/frontswap.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 +- mm/hugetlb_vmemmap.c | 2 +- mm/ksm.c | 4 +- mm/mmap.c | 2 +- mm/rmap.c | 8 +- mm/sparse-vmemmap.c | 2 +- mm/util.c | 2 +- tools/vm/page_owner_sort.c | 2 +- 176 files changed, 8355 insertions(+), 8359 deletions(-) create mode 100644 Documentation/mm/active_mm.rst create mode 100644 Documentation/mm/arch_pgtable_helpers.rst create mode 100644 Documentation/mm/balance.rst create mode 100644 Documentation/mm/bootmem.rst create mode 100644 Documentation/mm/damon/api.rst create mode 100644 Documentation/mm/damon/design.rst create mode 100644 Documentation/mm/damon/faq.rst create mode 100644 Documentation/mm/damon/index.rst create mode 100644 Documentation/mm/free_page_reporting.rst create mode 100644 Documentation/mm/frontswap.rst create mode 100644 Documentation/mm/highmem.rst create mode 100644 Documentation/mm/hmm.rst create mode 100644 Documentation/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/mm/hwpoison.rst create mode 100644 Documentation/mm/index.rst create mode 100644 Documentation/mm/ksm.rst create mode 100644 Documentation/mm/memory-model.rst create mode 100644 Documentation/mm/mmu_notifier.rst create mode 100644 Documentation/mm/numa.rst create mode 100644 Documentation/mm/oom.rst create mode 100644 Documentation/mm/overcommit-accounting.rst create mode 100644 Documentation/mm/page_allocation.rst create mode 100644 Documentation/mm/page_cache.rst create mode 100644 Documentation/mm/page_frags.rst create mode 100644 Documentation/mm/page_migration.rst create mode 100644 Documentation/mm/page_owner.rst create mode 100644 Documentation/mm/page_reclaim.rst create mode 100644 Documentation/mm/page_table_check.rst create mode 100644 Documentation/mm/page_tables.rst create mode 100644 Documentation/mm/physical_memory.rst create mode 100644 Documentation/mm/process_addrs.rst create mode 100644 Documentation/mm/remap_file_pages.rst create mode 100644 
Documentation/mm/shmfs.rst create mode 100644 Documentation/mm/slab.rst create mode 100644 Documentation/mm/slub.rst create mode 100644 Documentation/mm/split_page_table_lock.rst create mode 100644 Documentation/mm/swap.rst create mode 100644 Documentation/mm/transhuge.rst create mode 100644 Documentation/mm/unevictable-lru.rst create mode 100644 Documentation/mm/vmalloc.rst create mode 100644 Documentation/mm/vmalloced-kernel-stacks.rst create mode 100644 Documentation/mm/vmemmap_dedup.rst create mode 100644 Documentation/mm/z3fold.rst create mode 100644 Documentation/mm/zsmalloc.rst create mode 100644 Documentation/translations/zh_CN/mm/active_mm.rst create mode 100644 Documentation/translations/zh_CN/mm/balance.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/api.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/design.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/faq.rst create mode 100644 Documentation/translations/zh_CN/mm/damon/index.rst create mode 100644 Documentation/translations/zh_CN/mm/free_page_reporting.rst create mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst create mode 100644 Documentation/translations/zh_CN/mm/highmem.rst create mode 100644 Documentation/translations/zh_CN/mm/hmm.rst create mode 100644 Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst create mode 100644 Documentation/translations/zh_CN/mm/hwpoison.rst create mode 100644 Documentation/translations/zh_CN/mm/index.rst create mode 100644 Documentation/translations/zh_CN/mm/ksm.rst create mode 100644 Documentation/translations/zh_CN/mm/memory-model.rst create mode 100644 Documentation/translations/zh_CN/mm/mmu_notifier.rst create mode 100644 Documentation/translations/zh_CN/mm/numa.rst create mode 100644 Documentation/translations/zh_CN/mm/overcommit-accounting.rst create mode 100644 Documentation/translations/zh_CN/mm/page_frags.rst create mode 100644 Documentation/translations/zh_CN/mm/page_owner.rst create mode 
100644 Documentation/translations/zh_CN/mm/page_table_check.rst create mode 100644 Documentation/translations/zh_CN/mm/remap_file_pages.rst create mode 100644 Documentation/translations/zh_CN/mm/split_page_table_lock.rst create mode 100644 Documentation/translations/zh_CN/mm/z3fold.rst create mode 100644 Documentation/translations/zh_CN/mm/zsmalloc.rst delete mode 100644 Documentation/translations/zh_CN/vm/active_mm.rst delete mode 100644 Documentation/translations/zh_CN/vm/balance.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/api.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/design.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/faq.rst delete mode 100644 Documentation/translations/zh_CN/vm/damon/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/free_page_reporting.rst delete mode 100644 Documentation/translations/zh_CN/vm/frontswap.rst delete mode 100644 Documentation/translations/zh_CN/vm/highmem.rst delete mode 100644 Documentation/translations/zh_CN/vm/hmm.rst delete mode 100644 Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/translations/zh_CN/vm/hwpoison.rst delete mode 100644 Documentation/translations/zh_CN/vm/index.rst delete mode 100644 Documentation/translations/zh_CN/vm/ksm.rst delete mode 100644 Documentation/translations/zh_CN/vm/memory-model.rst delete mode 100644 Documentation/translations/zh_CN/vm/mmu_notifier.rst delete mode 100644 Documentation/translations/zh_CN/vm/numa.rst delete mode 100644 Documentation/translations/zh_CN/vm/overcommit-accounting.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_frags.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_owner.rst delete mode 100644 Documentation/translations/zh_CN/vm/page_table_check.rst delete mode 100644 Documentation/translations/zh_CN/vm/remap_file_pages.rst delete mode 100644 Documentation/translations/zh_CN/vm/split_page_table_lock.rst delete mode 
100644 Documentation/translations/zh_CN/vm/z3fold.rst delete mode 100644 Documentation/translations/zh_CN/vm/zsmalloc.rst delete mode 100644 Documentation/vm/.gitignore delete mode 100644 Documentation/vm/active_mm.rst delete mode 100644 Documentation/vm/arch_pgtable_helpers.rst delete mode 100644 Documentation/vm/balance.rst delete mode 100644 Documentation/vm/bootmem.rst delete mode 100644 Documentation/vm/damon/api.rst delete mode 100644 Documentation/vm/damon/design.rst delete mode 100644 Documentation/vm/damon/faq.rst delete mode 100644 Documentation/vm/damon/index.rst delete mode 100644 Documentation/vm/free_page_reporting.rst delete mode 100644 Documentation/vm/frontswap.rst delete mode 100644 Documentation/vm/highmem.rst delete mode 100644 Documentation/vm/hmm.rst delete mode 100644 Documentation/vm/hugetlbfs_reserv.rst delete mode 100644 Documentation/vm/hwpoison.rst delete mode 100644 Documentation/vm/index.rst delete mode 100644 Documentation/vm/ksm.rst delete mode 100644 Documentation/vm/memory-model.rst delete mode 100644 Documentation/vm/mmu_notifier.rst delete mode 100644 Documentation/vm/numa.rst delete mode 100644 Documentation/vm/oom.rst delete mode 100644 Documentation/vm/overcommit-accounting.rst delete mode 100644 Documentation/vm/page_allocation.rst delete mode 100644 Documentation/vm/page_cache.rst delete mode 100644 Documentation/vm/page_frags.rst delete mode 100644 Documentation/vm/page_migration.rst delete mode 100644 Documentation/vm/page_owner.rst delete mode 100644 Documentation/vm/page_reclaim.rst delete mode 100644 Documentation/vm/page_table_check.rst delete mode 100644 Documentation/vm/page_tables.rst delete mode 100644 Documentation/vm/physical_memory.rst delete mode 100644 Documentation/vm/process_addrs.rst delete mode 100644 Documentation/vm/remap_file_pages.rst delete mode 100644 Documentation/vm/shmfs.rst delete mode 100644 Documentation/vm/slab.rst delete mode 100644 Documentation/vm/slub.rst delete mode 100644 
Documentation/vm/split_page_table_lock.rst delete mode 100644 Documentation/vm/swap.rst delete mode 100644 Documentation/vm/transhuge.rst delete mode 100644 Documentation/vm/unevictable-lru.rst delete mode 100644 Documentation/vm/vmalloc.rst delete mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst delete mode 100644 Documentation/vm/vmemmap_dedup.rst delete mode 100644 Documentation/vm/z3fold.rst delete mode 100644 Documentation/vm/zsmalloc.rst (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 1c9bed5595f5..d244674a9480 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -41,7 +41,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.rst for more information. + See Documentation/mm/ksm.rst for more information. What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index c440f4946e12..cd5fb8fa3ddf 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.rst). + enabled for that cache (see Documentation/mm/slub.rst). What: /sys/kernel/slab//alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.rst). + Documentation/mm/slub.rst). 
What: /sys/kernel/slab/<cache>/free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..8c0ea6b6c6a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5442,7 +5442,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -5456,13 +5456,13 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -5471,12 +5471,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_merge [MM, SLUB] Same with slab_merge. diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst index b966fcff993b..c79f1e336222 100644 --- a/Documentation/admin-guide/mm/concepts.rst +++ b/Documentation/admin-guide/mm/concepts.rst @@ -125,7 +125,7 @@ processor. 
Each bank is referred to as a `node` and for each node Linux constructs an independent memory management subsystem. A node has its own set of zones, lists of free and used pages and various statistics counters. You can find more details about NUMA in -:ref:`Documentation/vm/numa.rst <numa>` and in +:ref:`Documentation/mm/numa.rst <numa>` and in :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`. Page cache diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 61aff88347f3..c4681fa69b9c 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -4,7 +4,7 @@ Monitoring Data Accesses ======================== -:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring. +:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and optimize those. diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 46306f1f34b1..a8bd3bd29959 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -268,4 +268,4 @@ granularity reclamation. :: .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1bb7b72414b2..5540a3a40fc9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -30,11 +30,11 @@ DAMON provides below interfaces for different users. <sysfs_interface>`. This will be removed after next LTS kernel is released, so users should move to the :ref:`sysfs interface <sysfs_interface>`. - *Kernel Space Programming Interface.* - :doc:`This </vm/damon/api>` is for kernel space programmers. 
Using this, + :doc:`This </mm/damon/api>` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space DAMON application programs for you. You can even extend DAMON for various address spaces. For detail, please refer to the interface - :doc:`document </vm/damon/api>`. + :doc:`document </mm/damon/api>`. .. _sysfs_interface: @@ -185,7 +185,7 @@ controls the monitoring overhead, exist. You can set and get the values by writing to and reading from the files. For more details about the intervals and monitoring regions range, please refer -to the Design document (:doc:`/vm/damon/design`). +to the Design document (:doc:`/mm/damon/design`). contexts/<N>/targets/ --------------------- @@ -402,7 +402,7 @@ Attributes Users can get and set the ``sampling interval``, ``aggregation interval``, ``update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. To know about the monitoring -attributes in detail, please refer to the :doc:`/vm/damon/design`. For +attributes in detail, please refer to the :doc:`/mm/damon/design`. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and 1000, and then check it again:: diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5c9aa171a0d3..4a440a7cfeb0 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -760,7 +760,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting.rst and +See Documentation/mm/overcommit-accounting.rst and mm/util.c::__vm_enough_memory() for more information. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index dedd4d853329..5b1188494bcd 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -87,7 +87,7 @@ Memory management ================= How to allocate and use memory in the kernel. 
Note that there is a lot -more memory-management documentation in Documentation/vm/index.rst. +more memory-management documentation in Documentation/mm/index.rst. .. toctree:: :maxdepth: 1 diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 1bc91fb8c321..8543a59f288f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1109,7 +1109,7 @@ CommitLimit yield a CommitLimit of 7.3G. For more details, see the memory overcommit documentation - in vm/overcommit-accounting. + in mm/overcommit-accounting. Committed_AS The amount of memory presently allocated on the system. The committed memory is a sum of all of the memory which diff --git a/Documentation/index.rst b/Documentation/index.rst index 67036a05b771..4737c18c97ff 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -128,7 +128,7 @@ needed). sound/index crypto/index filesystems/index - vm/index + mm/index bpf/index usb/index PCI/index diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst new file mode 100644 index 000000000000..6f8269c284ed --- /dev/null +++ b/Documentation/mm/active_mm.rst @@ -0,0 +1,91 @@ +.. _active_mm: + +========= +Active MM +========= + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + Cc'd to linux-kernel, because I don't write explanations all that often, + and when I do I feel better about more people reading them. + + On Fri, 30 Jul 1999, David Mosberger wrote: + > + > Is there a brief description someplace on how "mm" vs. "active_mm" in + > the task_struct are supposed to be used? (My apologies if this was + > discussed on the mailing lists---I just returned from vacation and + > wasn't able to follow linux-kernel for a while). + + Basically, the new setup is: + + - we have "real address spaces" and "anonymous address spaces". 
The + difference is that an anonymous address space doesn't care about the + user-level page tables at all, so when we do a context switch into an + anonymous address space we just leave the previous address space + active. + + The obvious use for a "anonymous address space" is any thread that + doesn't need any user mappings - all kernel threads basically fall into + this category, but even "real" threads can temporarily say that for + some amount of time they are not going to be interested in user space, + and that the scheduler might as well try to avoid wasting time on + switching the VM state around. Currently only the old-style bdflush + sync does that. + + - "tsk->mm" points to the "real address space". For an anonymous process, + tsk->mm will be NULL, for the logical reason that an anonymous process + really doesn't _have_ a real address space at all. + + - however, we obviously need to keep track of which address space we + "stole" for such an anonymous user. For that, we have "tsk->active_mm", + which shows what the currently active address space is. + + The rule is that for a process with a real address space (ie tsk->mm is + non-NULL) the active_mm obviously always has to be the same as the real + one. + + For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the + "borrowed" mm while the anonymous process is running. When the + anonymous process gets scheduled away, the borrowed address space is + returned and cleared. + + To support all that, the "struct mm_struct" now has two counters: a + "mm_users" counter that is how many "real address space users" there are, + and a "mm_count" counter that is the number of "lazy" users (ie anonymous + users) plus one if there are any real users. + + Usually there is at least one real user, but it could be that the real + user exited on another CPU while a lazy user was still active, so you do + actually get cases where you have a address space that is _only_ used by + lazy users. 
That is often a short-lived state, because once that thread + gets scheduled away in favour of a real thread, the "zombie" mm gets + released because "mm_count" becomes zero. + + Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any + more. "init_mm" should be considered just a "lazy context when no other + context is available", and in fact it is mainly used just at bootup when + no real VM has yet been created. So code that used to check + + if (current->mm == &init_mm) + + should generally just do + + if (!current->mm) + + instead (which makes more sense anyway - the test is basically one of "do + we have a user context", and is generally done by the page fault handler + and things like that). + + Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, + because it slightly changes the interfaces to accommodate the alpha (who + would have thought it, but the alpha actually ends up having one of the + ugliest context switch codes - unlike the other architectures where the MM + and register state is separate, the alpha PALcode joins the two, and you + need to switch both together). + + (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst new file mode 100644 index 000000000000..cbaee9e59241 --- /dev/null +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -0,0 +1,260 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _arch_page_table_helpers: + +=============================== +Architecture Page Table Helpers +=============================== + +Generic MM expects architectures (with MMU) to provide helpers to create, access +and modify page table entries at various level for different memory functions. +These page table helpers need to conform to a common semantics across platforms. +Following tables describe the expected semantics which can also be tested during +boot via CONFIG_DEBUG_VM_PGTABLE option. 
All future changes in here or the debug +test need to be in sync. + + +PTE Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pte_same | Tests whether both PTE entries are the same | ++---------------------------+--------------------------------------------------+ +| pte_bad | Tests a non-table mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_present | Tests a valid mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_young | Tests a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_dirty | Tests a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_write | Tests a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_special | Tests a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_protnone | Tests a PROT_NONE PTE | ++---------------------------+--------------------------------------------------+ +| pte_devmap | Tests a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_soft_dirty | Tests a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_soft_dirty | Tests a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkyoung | Creates a young PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkold | Creates an old PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdirty | Creates a dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkclean | Creates a clean PTE | 
++---------------------------+--------------------------------------------------+ +| pte_mkwrite | Creates a writable PTE | ++---------------------------+--------------------------------------------------+ +| pte_wrprotect | Creates a write protected PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkspecial | Creates a special PTE | ++---------------------------+--------------------------------------------------+ +| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mksoft_dirty | Creates a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_clear_soft_dirty | Clears a soft dirty PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | ++---------------------------+--------------------------------------------------+ +| pte_mknotpresent | Invalidates a mapped PTE | ++---------------------------+--------------------------------------------------+ +| ptep_clear | Clears a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | ++---------------------------+--------------------------------------------------+ +| ptep_test_and_clear_young | Clears young from a PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_wrprotect | Converts into a write protected PTE | ++---------------------------+--------------------------------------------------+ +| ptep_set_access_flags | 
Converts into a more permissive PTE | ++---------------------------+--------------------------------------------------+ + + +PMD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pmd_same | Tests whether both PMD entries are the same | ++---------------------------+--------------------------------------------------+ +| pmd_bad | Tests a non-table mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_leaf | Tests a leaf mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_huge | Tests a HugeTLB mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | ++---------------------------+--------------------------------------------------+ +| pmd_present | Tests a valid mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_young | Tests a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_dirty | Tests a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_write | Tests a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_special | Tests a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_protnone | Tests a PROT_NONE PMD | ++---------------------------+--------------------------------------------------+ +| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_soft_dirty | Tests a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | 
++---------------------------+--------------------------------------------------+ +| pmd_mkyoung | Creates a young PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkold | Creates an old PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdirty | Creates a dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkclean | Creates a clean PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkwrite | Creates a writable PMD | ++---------------------------+--------------------------------------------------+ +| pmd_wrprotect | Creates a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkspecial | Creates a special PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mksoft_dirty | Creates a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_clear_soft_dirty | Clears a soft dirty PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | ++---------------------------+--------------------------------------------------+ +| pmd_mkinvalid | Invalidates a mapped PMD [1] | ++---------------------------+--------------------------------------------------+ +| pmd_set_huge | Creates a PMD huge mapping | ++---------------------------+--------------------------------------------------+ +| pmd_clear_huge | Clears a PMD huge mapping | 
++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_get_and_clear_full | Clears a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_test_and_clear_young | Clears young from a PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_wrprotect | Converts into a write protected PMD | ++---------------------------+--------------------------------------------------+ +| pmdp_set_access_flags | Converts into a more permissive PMD | ++---------------------------+--------------------------------------------------+ + + +PUD Page Table Helpers +====================== + ++---------------------------+--------------------------------------------------+ +| pud_same | Tests whether both PUD entries are the same | ++---------------------------+--------------------------------------------------+ +| pud_bad | Tests a non-table mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_leaf | Tests a leaf mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_huge | Tests a HugeTLB mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_trans_huge | Tests a Transparent Huge Page (THP) at PUD | ++---------------------------+--------------------------------------------------+ +| pud_present | Tests a valid mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_young | Tests a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_dirty | Tests a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_write | Tests a writable PUD | 
++---------------------------+--------------------------------------------------+ +| pud_devmap | Tests a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkyoung | Creates a young PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkold | Creates an old PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdirty | Creates a dirty PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkclean | Creates a clean PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkwrite | Creates a writable PUD | ++---------------------------+--------------------------------------------------+ +| pud_wrprotect | Creates a write protected PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | ++---------------------------+--------------------------------------------------+ +| pud_mkinvalid | Invalidates a mapped PUD [1] | ++---------------------------+--------------------------------------------------+ +| pud_set_huge | Creates a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pud_clear_huge | Clears a PUD huge mapping | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_get_and_clear_full | Clears a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_test_and_clear_young | Clears young from a PUD | ++---------------------------+--------------------------------------------------+ +| pudp_set_wrprotect | Converts into a write protected PUD | 
++---------------------------+--------------------------------------------------+ +| pudp_set_access_flags | Converts into a more permissive PUD | ++---------------------------+--------------------------------------------------+ + + +HugeTLB Page Table Helpers +========================== + ++---------------------------+--------------------------------------------------+ +| pte_huge | Tests a HugeTLB | ++---------------------------+--------------------------------------------------+ +| pte_mkhuge | Creates a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_dirty | Tests a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_write | Tests a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkdirty | Creates a dirty HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_mkwrite | Creates a writable HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_pte_wrprotect | Creates a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_get_and_clear | Clears a HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | ++---------------------------+--------------------------------------------------+ +| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | ++---------------------------+--------------------------------------------------+ + + +SWAP Page Table Helpers +======================== + ++---------------------------+--------------------------------------------------+ +| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | ++---------------------------+--------------------------------------------------+ 
+| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | ++---------------------------+--------------------------------------------------+ +| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | ++---------------------------+--------------------------------------------------+ +| is_migration_entry | Tests a migration (read or write) swapped entry | ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ + +[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst new file mode 100644 index 000000000000..6a1fadf3e173 --- /dev/null +++ b/Documentation/mm/balance.rst @@ -0,0 +1,102 @@ +.. _balance: + +================ +Memory Balancing +================ + +Started Jan 2000 by Kanoj Sarcar + +Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +well as for non __GFP_IO allocations. + +The first reason why a caller may avoid reclaim is that the caller can not +sleep due to holding a spinlock or is in interrupt context. The second may +be that the caller is willing to fail the allocation without incurring the +overhead of page reclaim. This may happen for opportunistic high-order +allocation requests that have order-0 fallback options. 
In such cases, +the caller may also wish to avoid waking kswapd. + +__GFP_IO allocation requests are made to prevent file system deadlocks. + +In the absence of non sleepable allocation requests, it seems detrimental +to be doing balancing. Page reclamation can be kicked off lazily, that +is, only when needed (aka zone free memory is 0), instead of making it +a proactive process. + +That being said, the kernel should try to fulfill requests for direct +mapped pages from the direct mapped pool, instead of falling back on +the dma pool, so as to keep the dma pool filled for dma requests (atomic +or not). A similar argument applies to highmem and direct mapped pages. +OTOH, if there is a lot of free dma pages, it is preferable to satisfy +regular memory requests by allocating one from the dma pool, instead +of incurring the overhead of regular zone balancing. + +In 2.2, memory balancing/page reclamation would kick off only when the +_total_ number of free pages fell below 1/64 th of total memory. With the +right ratio of dma and regular memory, it is quite possible that balancing +would not be done even when the dma zone was completely empty. 2.2 has +been running production machines of varying memory sizes, and seems to be +doing fine even with the presence of this problem. In 2.3, due to +HIGHMEM, this problem is aggravated. + +In 2.3, zone balancing can be done in one of two ways: depending on the +zone size (and possibly of the size of lower class zones), we can decide +at init time how many free pages we should aim for while balancing any +zone. The good part is, while balancing, we do not need to look at sizes +of lower class zones, the bad part is, we might do too frequent balancing +due to ignoring possibly lower usage in the lower class zones. Also, +with a slight change in the allocation routine, it is possible to reduce +the memclass() macro to be a simple equality. 
+ +Another possible solution is that we balance only when the free memory +of a zone _and_ all its lower class zones falls below 1/64th of the +total memory in the zone and its lower class zones. This fixes the 2.2 +balancing problem, and stays as close to 2.2 behavior as possible. Also, +the balancing algorithm works the same way on the various architectures, +which have different numbers and types of zones. If we wanted to get +fancy, we could assign different weights to free pages in different +zones in the future. + +Note that if the size of the regular zone is huge compared to dma zone, +it becomes less significant to consider the free dma pages while +deciding whether to balance the regular zone. The first solution +becomes more attractive then. + +The appended patch implements the second solution. It also "fixes" two +problems: first, kswapd is woken up as in 2.2 on low memory conditions +for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, +so as to give a fighting chance for replace_with_highmem() to get a +HIGHMEM page, as well as to ensure that HIGHMEM allocations do not +fall back into regular zone. This also makes sure that HIGHMEM pages +are not leaked (for example, in situations where a HIGHMEM page is in +the swapcache but is not being used by anyone) + +kswapd also needs to know about the zones it should balance. kswapd is +primarily needed in a situation where balancing can not be done, +probably because all allocation requests are coming from intr context +and all process contexts are sleeping. For 2.3, kswapd does not really +need to balance the highmem zone, since intr context does not request +highmem pages. kswapd looks at the zone_wake_kswapd field in the zone +structure to decide whether a zone needs balancing. + +Page stealing from process memory and shm is done if stealing the page would +alleviate memory pressure on any zone in the page's node that has fallen below +its watermark. 
+ +watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteresis field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. + + +(Good) Ideas that I have heard: + +1. Dynamic experience should influence balancing: number of failed requests + for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) +2. Implement a replace_with_highmem()-like replace_with_regular() to preserve + dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/mm/bootmem.rst b/Documentation/mm/bootmem.rst new file mode 100644 index 000000000000..eb2b31eedfa1 --- /dev/null +++ b/Documentation/mm/bootmem.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +Boot Memory +=========== diff --git a/Documentation/mm/damon/api.rst b/Documentation/mm/damon/api.rst new file mode 100644 index 000000000000..08f34df45523 --- /dev/null +++ b/Documentation/mm/damon/api.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +API Reference +============= + +Kernel space programs can use every feature of DAMON using the APIs below. All you +need to do is include ``damon.h``, which is located in ``include/linux/`` of +the source tree. + +Structures +========== + +.. kernel-doc:: include/linux/damon.h + + +Functions +========= + +.. 
kernel-doc:: mm/damon/core.c diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst new file mode 100644 index 000000000000..0cff6fac6b7e --- /dev/null +++ b/Documentation/mm/damon/design.rst @@ -0,0 +1,176 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +Design +====== + +Configurable Layers +=================== + +DAMON provides data access monitoring functionality while making the accuracy +and the overhead controllable. The fundamental access monitorings require +primitives that are dependent on and optimized for the target address space. On +the other hand, the accuracy and overhead tradeoff mechanism, which is the core +of DAMON, is in the pure logic space. DAMON separates the two parts in +different layers and defines its interface to allow various low level +primitives implementations configurable with the core logic. We call the low +level primitives implementations monitoring operations. + +Due to this separated design and the configurable interface, users can extend +DAMON for any address space by configuring the core logics with appropriate +monitoring operations. If an appropriate one is not provided, users can implement +the operations on their own. + +For example, physical memory, virtual memory, swap space, those for specific +processes, NUMA nodes, files, and backing memory devices would be supportable. +Also, if some architectures or devices support special optimized access check +primitives, those will be easily configurable. + + +Reference Implementations of Address Space Specific Monitoring Operations +========================================================================= + +The monitoring operations are defined in two parts: + +1. Identification of the monitoring target address range for the address space. +2. Access check of specific address range in the target space. + +DAMON currently provides the implementations of the operations for the physical +and virtual address spaces. 
Below two subsections describe how those work. + + +VMA-based Target Address Range Construction +------------------------------------------- + +This is only for the virtual address space monitoring operations +implementation. That for the physical address space simply asks users to +manually set the monitoring target address ranges. + +Only small parts in the super-huge virtual address space of the processes are +mapped to the physical memory and accessed. Thus, tracking the unmapped +address regions is just wasteful. However, because DAMON can deal with some +level of noise using the adaptive regions adjustment mechanism, tracking every +mapping is not strictly required but could even incur a high overhead in some +cases. That said, too huge unmapped areas inside the monitoring target should +be removed to not take the time for the adaptive mechanism. + +For this reason, this implementation converts the complex mappings to three +distinct regions that cover every mapped area of the address space. The two +gaps between the three regions are the two biggest unmapped areas in the given +address space. The two biggest unmapped areas would be the gap between the +heap and the uppermost mmap()-ed region, and the gap between the lowermost +mmap()-ed region and the stack in most of the cases. Because these gaps are +exceptionally huge in usual address spaces, excluding these will be sufficient +to make a reasonable trade-off. Below shows this in detail:: + + <heap> + <BIG UNMAPPED REGION 1> + <uppermost mmap()-ed region> + (small mmap()-ed regions and munmap()-ed regions) + <lowermost mmap()-ed region> + <BIG UNMAPPED REGION 2> + <stack> + + +PTE Accessed-bit Based Access Check +----------------------------------- + +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. The only difference is the way of +finding the relevant PTE Accessed bit(s) from the address. 
While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for the next sampling target address and check whether the +bit(s) are set again after one sampling period. This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling +the interference is the responsibility of sysadmins. However, it solves the +conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, +as Idle page tracking does. + + +Address Space Independent Core Mechanisms +========================================= + +Below four sections describe each of the DAMON core mechanisms and the five +monitoring attributes, ``sampling interval``, ``aggregation interval``, +``update interval``, ``minimum number of regions``, and ``maximum number of +regions``. + + +Access Frequency Monitoring +--------------------------- + +The output of DAMON says what pages are how frequently accessed for a given +duration. The resolution of the access frequency is controlled by setting +``sampling interval`` and ``aggregation interval``. In detail, DAMON checks +access to each page per ``sampling interval`` and aggregates the results. In +other words, it counts the number of the accesses to each page. After each +``aggregation interval`` passes, DAMON calls callback functions that were previously +registered by users so that users can read the aggregated results and then +clears the results. 
This can be described in the simple pseudo-code below:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling_interval) + +The monitoring overhead of this mechanism will arbitrarily increase as the +size of the target workload grows. + + +Region Based Sampling +--------------------- + +To avoid the unbounded increase of the overhead, DAMON groups adjacent pages +that are assumed to have the same access frequencies into a region. As long as the +assumption (pages in a region have the same access frequencies) is kept, only +one page in the region is required to be checked. Thus, for each ``sampling +interval``, DAMON randomly picks one page in each region, waits for one +``sampling interval``, checks whether the page is accessed meanwhile, and +increases the access frequency of the region if so. Therefore, the monitoring +overhead is controllable by setting the number of regions. DAMON allows users +to set the minimum and the maximum number of regions for the trade-off. + +This scheme, however, cannot preserve the quality of the output if the +assumption is not guaranteed. + + +Adaptive Regions Adjustment +--------------------------- + +Even if somehow the initial monitoring target regions are well constructed to +fulfill the assumption (pages in same region have similar access frequencies), +the data access pattern can be dynamically changed. This will result in low +monitoring quality. To keep the assumption as much as possible, DAMON +adaptively merges and splits each region based on their access frequency. + +For each ``aggregation interval``, it compares the access frequencies of +adjacent regions and merges those if the frequency difference is small. 
Then, +after it reports and clears the aggregated access frequency of each region, it +splits each region into two or three regions if the total number of regions +will not exceed the user-specified maximum number of regions after the split. + +In this way, DAMON provides its best-effort quality and minimal overhead while +keeping the bounds users set for their trade-off. + + +Dynamic Target Space Updates Handling +------------------------------------- + +The monitoring target address range could be dynamically changed. For example, +virtual memory could be dynamically mapped and unmapped. Physical memory could +be hot-plugged. + +As the changes could be quite frequent in some cases, DAMON allows the +monitoring operations to check dynamic changes including memory mapping changes +and applies them to monitoring operations-related data structures such as the +abstracted monitoring target memory area only once per user-specified time +interval (``update interval``). diff --git a/Documentation/mm/damon/faq.rst b/Documentation/mm/damon/faq.rst new file mode 100644 index 000000000000..dde7e2414ee6 --- /dev/null +++ b/Documentation/mm/damon/faq.rst @@ -0,0 +1,50 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Frequently Asked Questions +========================== + +Why a new subsystem, instead of extending perf or other user space tools? +========================================================================= + +First, because it needs to be as lightweight as possible so that it can be +used online, any unnecessary overhead such as kernel - user space context +switching cost should be avoided. Second, DAMON aims to be used by other +programs including the kernel. Therefore, having a dependency on specific +tools like perf is not desirable. These are the two biggest reasons why DAMON +is implemented in the kernel space. + + +Can 'idle pages tracking' or 'perf mem' substitute DAMON? 
+========================================================= + +Idle page tracking is a low level primitive for access check of the physical +address space. 'perf mem' is similar, though it can use sampling to minimize +the overhead. On the other hand, DAMON is a higher-level framework for the +monitoring of various address spaces. It is focused on memory management +optimization and provides sophisticated accuracy/overhead handling mechanisms. +Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of +DAMON's output, but cannot substitute DAMON. + + +Does DAMON support virtual memory only? +======================================= + +No. The core of the DAMON is address space independent. The address space +specific monitoring operations including monitoring target regions +constructions and actual access checks can be implemented and configured on the +DAMON core by the users. In this way, DAMON users can monitor any address +space with any access check technique. + +Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based +implementations of the address space dependent functions for the virtual memory +and the physical memory by default, for a reference and convenient use. + + +Can I simply monitor page granularity? +====================================== + +Yes. You can do so by setting the ``min_nr_regions`` attribute higher than the +working set size divided by the page size. Because the monitoring target +regions size is forced to be ``>=page size``, the region split will make no +effect. diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst new file mode 100644 index 000000000000..48c0bbff98b2 --- /dev/null +++ b/Documentation/mm/damon/index.rst @@ -0,0 +1,29 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +DAMON: Data Access MONitor +========================== + +DAMON is a data access monitoring framework subsystem for the Linux kernel. 
+The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it + + - *accurate* (the monitoring output is useful enough for DRAM level memory + management; it might not be appropriate for CPU Cache levels, though), + - *light-weight* (the monitoring overhead is low enough to be applied online), + and + - *scalable* (the upper-bound of the overhead is in constant range regardless + of the size of target workloads). + +Using this framework, therefore, the kernel's memory management mechanisms can +make advanced decisions. Experimental memory management optimization works +that incurred high data access monitoring overhead could be implemented again. +In user space, meanwhile, users who have some special workloads can write +personalized applications for better understanding and optimizations of their +workloads and systems. + +.. toctree:: + :maxdepth: 2 + + faq + design + api diff --git a/Documentation/mm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst new file mode 100644 index 000000000000..8c05e62d8b2b --- /dev/null +++ b/Documentation/mm/free_page_reporting.rst @@ -0,0 +1,40 @@ +.. _free_page_reporting: + +===================== +Free Page Reporting +===================== + +Free page reporting is an API by which a device can register to receive +lists of pages that are currently unused by the system. This is useful in +the case of virtualization where a guest is then able to use this data to +notify the hypervisor that it is no longer using certain pages in memory. + +For the driver, typically a balloon driver, to make use of this functionality +it will allocate and initialize a page_reporting_dev_info structure. The +field within the structure it will populate is the "report" function +pointer used to process the scatterlist. It must also guarantee that it can +handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per +call to the function. 
A call to page_reporting_register will register the +page reporting interface with the reporting framework assuming no other +page reporting devices are already registered. + +Once registered the page reporting API will begin reporting batches of +pages to the driver. The API will start reporting pages 2 seconds after +the interface is registered and will continue to do so 2 seconds after any +page of a sufficiently high order is freed. + +Pages reported will be stored in the scatterlist passed to the reporting +function with the final entry having the end bit set in entry nent - 1. +While pages are being processed by the report function they will not be +accessible to the allocator. Once the report function has been completed +the pages will be returned to the free area from which they were obtained. + +Prior to removing a driver that is making use of free page reporting it +is necessary to call page_reporting_unregister to have the +page_reporting_dev_info structure that is currently in use by free page +reporting removed. Doing this will prevent further reports from being +issued via the interface. If another driver or the same driver is +registered it is possible for it to resume where the previous driver had +left off in terms of reporting free pages. + +Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst new file mode 100644 index 000000000000..feecc5e24477 --- /dev/null +++ b/Documentation/mm/frontswap.rst @@ -0,0 +1,266 @@ +.. _frontswap: + +========= +Frontswap +========= + +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. 
The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "store" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "load" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further stores with that swap type. + +Once a page is successfully stored, a matching load on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the store returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a store returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. + +Note that if a page is stored and the page already exists in transcendent memory +(a "duplicate" store), either the store succeeds and the data is overwritten, +or the store fails AND the page is invalidated. This ensures stale data may +never be obtained from frontswap. 
+ +If properly configured, monitoring of frontswap is done via debugfs in +the `/sys/kernel/debug/frontswap` directory. The effectiveness of +frontswap can be measured (across all swap devices) with: + +``failed_stores`` + how many store attempts have failed + +``loads`` + how many loads were attempted (all should succeed) + +``succ_stores`` + how many store attempts have succeeded + +``invalidates`` + how many invalidates were attempted + +A backend implementation may provide additional metrics. + +FAQ +=== + +* Where's the value? + +When a workload starts swapping, performance falls through the floor. +Frontswap significantly increases performance in many such workloads by +providing a clean, dynamic interface to read and write swap pages to +"transcendent memory" that is otherwise not directly addressable to the kernel. +This interface is ideal when data is transformed to a different form +and size (such as with compression) or secretly moved (as might be +useful for write-balancing for some RAM-like devices). Swap pages (and +evicted page-cache pages) are a great use for this kind of slower-than-RAM- +but-much-faster-than-disk "pseudo-RAM device". + +Frontswap with a fairly small impact on the kernel, +provides a huge amount of flexibility for more dynamic, flexible RAM +utilization in various system configurations: + +In the single kernel case, aka "zcache", pages are compressed and +stored in local memory, thus increasing the total anonymous pages +that can be safely kept in RAM. Zcache essentially trades off CPU +cycles used in compression/decompression for better memory utilization. +Benchmarks have shown little or no impact when memory pressure is +low while providing a significant performance improvement (25%+) +on some workloads under high memory pressure. + +"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory +support for clustered systems. 
Frontswap pages are locally compressed +as in zcache, but then "remotified" to another system's RAM. This +allows RAM to be dynamically load-balanced back-and-forth as needed, +i.e. when system A is overcommitted, it can swap to system B, and +vice versa. RAMster can also be configured as a memory server so +many servers in a cluster can swap, dynamically as needed, to a single +server configured with a large amount of RAM... without pre-configuring +how much of the RAM is available for each of the clients! + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources across the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to do +it well with no kernel changes have essentially failed (except in some +well-publicized special-case workloads). +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "selfballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM (if overall host system memory +conditions allow), thus mitigating the potentially awful performance impact +of unplanned swapping. + +A KVM implementation is underway and has been RFC'ed to lkml. And, +using frontswap, investigation is also underway on the use of NVM as +a memory extension technology. + +* Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. 
If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "store" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +When swap pages are stored in transcendent memory instead of written +out to disk, there is a side effect that this may create more memory +pressure that can potentially outweigh the other advantages. A +backend, such as zcache, must implement policies to carefully (but +dynamically) manage memory limits to ensure this doesn't happen. + +* OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. 
+ +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "store" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (cf. swap_writepage()), frontswap_store is called. Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_store returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_load() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +store" and (possibly) a "frontswap backend load", which are presumably much +faster. + +* Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap, + or maybe swap-over-nbd/NFS)? + +No. First, the existing swap subsystem doesn't allow for any kind of +swap hierarchy.
Perhaps it could be rewritten to accommodate a hierarchy, +but this would require fairly drastic changes. Even if it were +rewritten, the existing swap subsystem uses the block I/O layer which +assumes a swap device is fixed size and any page in it is linearly +addressable. Frontswap barely touches the existing swap subsystem, +and works around the constraints of the block I/O subsystem to provide +a great deal of flexibility and dynamicity. + +For example, the acceptance of any swap page by the frontswap backend is +entirely unpredictable. This is critical to the definition of frontswap +backends because it grants completely dynamic discretion to the +backend. In zcache, one cannot know a priori how compressible a page is. +"Poorly" compressible pages can be rejected, and "poorly" can itself be +defined dynamically depending on current memory constraints. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. That said, only the initial "store" +and "load" operations need be synchronous. A separate asynchronous thread +is free to manipulate the pages stored by frontswap. For example, +the "remotification" thread in RAMster uses standard asynchronous +kernel sockets to move compressed frontswap pages to a remote machine. +Similarly, a KVM guest-side implementation could do in-guest compression +and use "batched" hypercalls. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". 
For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. + +There is a downside to the transcendent memory specifications for +frontswap: Since any "store" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. This means +that frontswap cannot contain more pages than the total of swapon'd +swap devices. For example, if NO swap device is configured on some +installation, frontswap is useless. Swapless portable devices +can still use frontswap but a backend for such devices must configure +some kind of "ghost" swap device and ensure that it is never used. + +* Why this weird definition about "duplicate stores"? If a page + has been previously successfully stored, can't it always be + successfully overwritten? + +Nearly always it can, but no, sometimes it cannot. Consider an example +where data is compressed and the original 4K page has been compressed +to 1K. Now an attempt is made to overwrite the page with data that +is non-compressible and so would take the entire 4K. But the backend +has no more space. In this case, the store must be rejected. Whenever +frontswap rejects a store that would overwrite, it also must invalidate +the old data and ensure that it is no longer accessible. Since the +swap subsystem then writes the new data to the real swap device, +this is the correct course of action to ensure coherency. + +* Why does the frontswap patch create the new include file swapfile.h? + +The frontswap code depends on some swap-subsystem-internal data +structures that have, over the years, moved back and forth between +static and global.
This seemed a reasonable compromise: Define +them as global but declare them in a new include file that isn't +included by the large number of source files that include swap.h. + +Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst new file mode 100644 index 000000000000..c9887f241c6c --- /dev/null +++ b/Documentation/mm/highmem.rst @@ -0,0 +1,167 @@ +.. _highmem: + +==================== +High Memory Handling +==================== + +By: Peter Zijlstra + +.. contents:: :local: + +What Is High Memory? +==================== + +High memory (highmem) is used when the size of physical memory approaches or +exceeds the maximum size of virtual memory. At that point it becomes +impossible for the kernel to keep all of the available physical memory mapped +at all times. This means the kernel needs to start using temporary mappings of +the pieces of physical memory that it wants to access. + +The part of (physical) memory not covered by a permanent mapping is what we +refer to as 'highmem'. There are various architecture dependent constraints on +where exactly that border lies. + +In the i386 arch, for example, we choose to map the kernel into every process's +VM space so that we don't have to pay the full TLB invalidation costs for +kernel entry/exit. This means the available virtual memory space (4GiB on +i386) has to be divided between user and kernel space. + +The traditional split for architectures using this approach is 3:1, 3GiB for +userspace and the top 1GiB for kernel space:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +This means that the kernel can at most map 1GiB of physical memory at any one +time, but because we need virtual address space for other things - including +temporary maps to access the rest of the physical memory - the actual direct +map will typically be less (usually around ~896MiB). 
+ +Other architectures that have mm context tagged TLBs can have separate kernel +and user maps. Some hardware (like some ARMs), however, have limited virtual +space when they use mm context tags. + + +Temporary Virtual Mappings +========================== + +The kernel contains several ways of creating temporary mappings. The following +list shows them in order of preference of use. + +* kmap_local_page(). This function is used to create short term mappings. + It can be invoked from any context (including interrupts) but the mappings + can only be used in the context which acquired them. + + This function should be preferred, where feasible, over all the others. + + These mappings are thread-local and CPU-local, meaning that the mapping + can only be accessed from within this thread and the thread is bound to the + CPU while the mapping is active. Even if the thread is preempted (since + preemption is never disabled by the function) the CPU can not be + unplugged from the system via CPU-hotplug until the mapping is disposed. + + It's valid to take pagefaults in a local kmap region, unless the context + in which the local mapping is acquired does not allow it for other reasons. + + kmap_local_page() always returns a valid virtual address and it is assumed + that kunmap_local() will never fail. + + Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain + extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered + because the map implementation is stack based. See kmap_local_page() kdocs + (included in the "Functions" section) for details on how to manage nested + mappings. + +* kmap_atomic(). This permits a very short duration mapping of a single + page. Since the mapping is restricted to the CPU that issued it, it + performs well, but the issuing task is therefore required to stay on that + CPU until it has finished, lest some other task displace its mappings.
+ + kmap_atomic() may also be used by interrupt contexts, since it does not + sleep and the callers too may not sleep until after kunmap_atomic() is + called. + + Each call of kmap_atomic() in the kernel creates a non-preemptible section + and disables pagefaults. This could be a source of unwanted latency. Therefore + users should prefer kmap_local_page() instead of kmap_atomic(). + + It is assumed that k[un]map_atomic() won't fail. + +* kmap(). This should be used to make short duration mapping of a single + page with no restrictions on preemption or migration. It comes with an + overhead as mapping space is restricted and protected by a global lock + for synchronization. When mapping is no longer needed, the address that + the page was mapped to must be released with kunmap(). + + Mapping changes must be propagated across all the CPUs. kmap() also + requires global TLB invalidation when the kmap's pool wraps and it might + block when the mapping space is fully utilized until a slot becomes + available. Therefore, kmap() is only callable from preemptible context. + + All the above work is necessary if a mapping must last for a relatively + long time but the bulk of high-memory mappings in the kernel are + short-lived and only used in one place. This means that the cost of + kmap() is mostly wasted in such cases. kmap() was not intended for long + term mappings but it has morphed in that direction and its use is + strongly discouraged in newer code and the set of the preceding functions + should be preferred. + + On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have + no real work to do because a 64-bit address space is more than sufficient to + address all the physical memory whose pages are permanently mapped. + +* vmap(). This can be used to make a long duration mapping of multiple + physical pages into a contiguous virtual space. It needs global + synchronization to unmap.
+ + +Cost of Temporary Mappings +========================== + +The cost of creating temporary mappings can be quite high. The arch has to +manipulate the kernel's page tables, the data TLB and/or the MMU's registers. + +If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping +simply with a bit of arithmetic that will convert the page struct address into +a pointer to the page contents rather than juggling mappings about. In such a +case, the unmap operation may be a null operation. + +If CONFIG_MMU is not set, then there can be no temporary mappings and no +highmem. In such a case, the arithmetic approach will also be used. + + +i386 PAE +======== + +The i386 arch, under some circumstances, will permit you to stick up to 64GiB +of RAM into your 32-bit machine. This has a number of consequences: + +* Linux needs a page-frame structure for each page in the system and the + pageframes need to live in the permanent mapping, which means: + +* you can have 896M/sizeof(struct page) page-frames at most; with struct + page being 32-bytes that would end up being something in the order of 112G + worth of pages; the kernel, however, needs to store more than just + page-frames in that memory... + +* PAE makes your page tables larger - which slows the system down as more + data has to be accessed to traverse in TLB fills and the like. One + advantage is that PAE has more PTE bits and can provide advanced features + like NX and PAT. + +The general recommendation is that you don't use more than 8GiB on a 32-bit +machine - although more might work for you and your workload, you're pretty +much on your own - don't expect kernel developers to really care much if things +come apart. + + +Functions +========= + +.. kernel-doc:: include/linux/highmem.h +.. 
kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst new file mode 100644 index 000000000000..f2a59ed82ed3 --- /dev/null +++ b/Documentation/mm/hmm.rst @@ -0,0 +1,452 @@ +.. _hmm: + +===================================== +Heterogeneous Memory Management (HMM) +===================================== + +Provide infrastructure and helpers to integrate non-conventional memory (device +memory like GPU on board memory) into regular kernel path, with the cornerstone +of this being specialized struct page for such memory (see sections 5 to 7 of +this document). + +HMM also provides optional helpers for SVM (Shared Virtual Memory), i.e., +allowing a device to transparently access program addresses coherently with +the CPU meaning that any valid pointer on the CPU is also a valid pointer +for the device. This is becoming mandatory to simplify the use of advanced +heterogeneous computing where GPU, DSP, or FPGA are used to perform various +computations on behalf of a process. + +This document is divided as follows: in the first section I expose the problems +related to using device specific memory allocators. In the second section, I +expose the hardware limitations that are inherent to many platforms. The third +section gives an overview of the HMM design. The fourth section explains how +CPU page-table mirroring works and the purpose of HMM in this context. The +fifth section deals with how device memory is represented inside the kernel. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. + +.. contents:: :local: + +Problems of using a device specific memory allocator +==================================================== + +Devices with a large amount of on board memory (several gigabytes) like GPUs +have historically managed their memory through dedicated driver specific APIs.
+This creates a disconnect between memory allocated and managed by a device +driver and regular application memory (private anonymous, shared memory, or +regular file backed memory). From here on I will refer to this aspect as split +address space. I use shared address space to refer to the opposite situation: +i.e., one in which any application memory region can be used by a device +transparently. + +Split address space happens because devices can only access memory allocated +through a device specific API. This implies that all memory objects in a program +are not equal from the device point of view which complicates large programs +that rely on a wide set of libraries. + +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap +share) and memory allocated through the device driver API (this still ends up +with an mmap but of the device file). + +For flat data sets (array, grid, image, ...) this isn't too hard to achieve but +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a +complex data set needs to re-map all the pointer relations between each of its +elements. This is error prone and programs get harder to debug because of the +duplicate data set and addresses. + +Split address space also means that libraries cannot transparently use data +they are getting from the core program or another library and thus each library +might have to duplicate its input data set using the device specific memory +allocator. Large projects suffer from this and waste resources because of the +various memory copies. + +Duplicating each library API to accept as input or output memory allocated by +each device specific allocator is not a viable option. It would lead to a +combinatorial explosion in the library entry points. 
+ +Finally, with the advance of high level language constructs (in C++ but in +other languages too) it is now possible for the compiler to leverage GPUs and +other devices without programmer knowledge. Some compiler identified patterns +are only do-able with a shared address space. It is also more reasonable to use +a shared address space for all other patterns. + + +I/O bus, device memory characteristics +====================================== + +I/O buses cripple shared address spaces due to a few limitations. Most I/O +buses only allow basic memory access from device to main memory; even cache +coherency is often optional. Access to device memory from a CPU is even more +limited. More often than not, it is not cache coherent. + +If we only consider the PCIE bus, then a device can access main memory (often +through an IOMMU) and be cache coherent with the CPUs. However, it only allows +a limited set of atomic operations from the device on main memory. This is worse +in the other direction: the CPU can only access a limited range of the device +memory and cannot perform atomic operations on it. Thus device memory cannot +be considered the same as regular memory from the kernel point of view. + +Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 +and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). +The final limitation is latency. Access to main memory from the device has an +order of magnitude higher latency than when the device accesses its own memory. + +Some platforms are developing new I/O buses or additions/modifications to PCIE +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the +architecture supports. Sadly, not all platforms are following this trend and +some major architectures are left without hardware solutions to these problems. 
+ +So for shared address space to make sense, not only must we allow devices to +access any memory but we must also permit any memory to be migrated to device +memory while the device is using it (blocking CPU access while it happens). + + +Shared address space and migration +================================== + +HMM intends to provide two main features. The first one is to share the address +space by duplicating the CPU page table in the device page table so the same +address points to the same physical memory for any valid main memory address in +the process address space. + +To achieve this, HMM offers a set of helpers to populate the device page table +while keeping track of CPU page table updates. Device page table updates are +not as easy as CPU page table updates. To update the device page table, you must +allocate a buffer (or use a pool of pre-allocated buffers) and write GPU +specific commands in it to perform the update (unmap, cache invalidations, and +flush, ...). This cannot be done through common code for all devices. Hence +why HMM provides helpers to factor out everything that can be while leaving the +hardware specific details to the device driver. + +The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that +allows allocating a struct page for each page of device memory. Those pages +are special because the CPU cannot map them. However, they allow migrating +main memory to device memory using existing migration mechanisms and everything +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +memory for the device memory and second to perform migration. Policy decisions +of what and when to migrate are left to the device driver. + +Note that any CPU access to a device page triggers a page fault and a migration +back to main memory.
For example, when a page backing a given CPU address A is +migrated from a main memory page to a device page, then any CPU access to +address A triggers a page fault and initiates a migration back to main memory. + +With these two features, HMM not only allows a device to mirror process address +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being +used by the device. + + +Address space mirroring implementation and API +============================================== + +Address space mirroring's main objective is to allow duplication of a range of +CPU page table into a device page table; HMM helps keep both synchronized. A +device driver that wants to mirror a process address space must start with the +registration of a mmu_interval_notifier:: + + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); + +During the ops->invalidate() callback the device driver must perform the +update action to the range (mark range read only, or fully unmap, etc.). The +device must complete the update before the driver callback returns. + +When the device driver wants to populate a range of virtual addresses, it can +use:: + + int hmm_range_fault(struct hmm_range *range); + +It will trigger a page fault on missing or read-only entries if write access is +requested (see below). Page faults use the generic mm page fault code path just +like a CPU page fault. + +Both functions copy CPU page table entries into their pfns array argument. Each +entry in that array corresponds to an address in the virtual range. HMM +provides a set of flags to help the driver identify special CPU page table +entries. 
+ +Locking within the invalidate() callback is the most important +aspect the driver must respect in order to keep things properly synchronized. +The usage pattern is:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... + + range.notifier = &interval_sub; + range.start = ...; + range.end = ...; + range.hmm_pfns = ...; + + if (!mmget_not_zero(interval_sub->notifier.mm)) + return -EFAULT; + + again: + range.notifier_seq = mmu_interval_read_begin(&interval_sub); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + if (ret) { + mmap_read_unlock(mm); + if (ret == -EBUSY) + goto again; + return ret; + } + mmap_read_unlock(mm); + + take_lock(driver->update); + if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) { + release_lock(driver->update); + goto again; + } + + /* Use pfns array content to update device page table, + * under the update lock */ + + release_lock(driver->update); + return 0; + } + +The driver->update lock is the same lock that the driver takes inside its +invalidate() callback. That lock must be held before calling +mmu_interval_read_retry() to avoid any race with a concurrent CPU page table +update. + +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify +fault or snapshot policy for the whole range instead of having to set them +for each entry in the pfns array. + +For instance if the device driver wants pages for a range with at least read +permission, it sets:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fault in all pages +in the range with at least read permission. + +Now let's say the driver wants to do the same except for one page in the range for +which it wants to have write permission.
Now the driver sets:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->hmm_pfns[index_of_write] = HMM_PFN_REQ_WRITE; + +With this, HMM will fault in all pages with at least read (i.e., valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission i.e., if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +After hmm_range_fault completes the flag bits are set to the current state of +the page tables, i.e., HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is +writable. + + +Represent and manage device memory from core kernel point of view +================================================================= + +Several different designs were tried to support device memory. The first one +used a device specific data structure to keep information about migrated memory +and HMM hooked itself in various places of mm code to handle any access to +addresses that were backed by device memory. It turns out that this ended up +replicating most of the fields of struct page and also needed many kernel code +paths to be updated to understand this new kind of memory. + +Most kernel code paths never try to access the memory behind a page +but only care about struct page contents. Because of this, HMM switched to +directly using struct page for device memory which left most kernel code paths +unaware of the difference. We only need to make sure that no one ever tries to +map those pages from the CPU side. + +Migration to and from device memory +=================================== + +Because the CPU cannot access device memory directly, the device driver must +use hardware DMA or device specific load/store instructions to migrate data. +The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() +functions are designed to make drivers easier to write and to centralize common +code across drivers.
+ +Before migrating pages to device private memory, special device private +``struct page`` need to be created. These will be used as special "swap" +page table entries so that a CPU process will fault if it tries to access +a page that has been migrated to device private memory. + +These can be allocated and freed with:: + + struct resource *res; + struct dev_pagemap pagemap; + + res = request_free_mem_region(&iomem_resource, /* number of bytes */, + "name of driver resource"); + pagemap.type = MEMORY_DEVICE_PRIVATE; + pagemap.range.start = res->start; + pagemap.range.end = res->end; + pagemap.nr_range = 1; + pagemap.ops = &device_devmem_ops; + memremap_pages(&pagemap, numa_node_id()); + + memunmap_pages(&pagemap); + release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + +There are also devm_request_free_mem_region(), devm_memremap_pages(), +devm_memunmap_pages(), and devm_release_mem_region() when the resources can +be tied to a ``struct device``. + +The overall migration steps are similar to migrating NUMA pages within system +memory (see :ref:`Page migration <page_migration>`) but the steps are split +between device driver specific code and shared common code: + +1. ``mmap_read_lock()`` + + The device driver has to pass a ``struct vm_area_struct`` to + migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to + be held for the duration of the migration. + +2. ``migrate_vma_setup(struct migrate_vma *args)`` + + The device driver initializes the ``struct migrate_vma`` fields and passes + the pointer to migrate_vma_setup(). The ``args->flags`` field is used to + filter which source pages should be migrated. For example, setting + ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and + ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in + device private memory. If the latter flag is set, the ``args->pgmap_owner`` + field is used to identify device private pages owned by the driver.
This + avoids trying to migrate device private pages residing in other devices. + Currently only anonymous private VMA ranges can be migrated to or from + system memory and device private memory. + + One of the first steps migrate_vma_setup() does is to invalidate other + devices' MMUs with the ``mmu_notifier_invalidate_range_start()`` and + ``mmu_notifier_invalidate_range_end()`` calls around the page table + walks to fill in the ``args->src`` array with PFNs to be migrated. + The ``invalidate_range_start()`` callback is passed a + ``struct mmu_notifier_range`` with the ``event`` field set to + ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to + the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This + allows the device driver to skip the invalidation callback and only + invalidate device private MMU mappings that are actually migrating. + This is explained more in the next section. + + While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()`` + entry results in a valid "zero" PFN stored in the ``args->src`` array. + This lets the driver allocate device private memory and clear it instead + of copying a page of zeros. Valid PTE entries to system memory or + device private struct pages will be locked with ``lock_page()``, isolated + from the LRU (if system memory since device private pages are not on + the LRU), unmapped from the process, and a special migration PTE is + inserted in place of the original PTE. + migrate_vma_setup() also clears the ``args->dst`` array. + +3. The device driver allocates destination pages and copies source pages to + destination pages. + + The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE`` + bit is set and skips entries that are not migrating. The device driver + can also choose to skip migrating a page by not filling in the ``dst`` + array for that page.
+ + The driver then allocates either a device private struct page or a + system memory page, locks the page with ``lock_page()``, and fills in the + ``dst`` array entry with:: + + dst[i] = migrate_pfn(page_to_pfn(dpage)); + + Now that the driver knows that this page is being migrated, it can + invalidate device private MMU mappings and copy device private memory + to system memory or another device private page. The core Linux kernel + handles CPU page table invalidations so the device driver only has to + invalidate its own MMU mappings. + + The driver can use ``migrate_pfn_to_page(src[i])`` to get the + ``struct page`` of the source and either copy the source page to the + destination or clear the destination device private memory if the pointer + is ``NULL`` meaning the source page was not populated in system memory. + +4. ``migrate_vma_pages()`` + + This step is where the migration is actually "committed". + + If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this + is where the newly allocated page is inserted into the CPU's page table. + This can fail if a CPU thread faults on the same page. However, the page + table is locked and only one of the new pages will be inserted. + The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared + if it loses the race. + + If the source page was locked, isolated, etc. the source ``struct page`` + information is now copied to destination ``struct page`` finalizing the + migration on the CPU side. + +5. Device driver updates device MMU page tables for pages still migrating, + rolling back pages not migrating. + + If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device + driver can update the device MMU and set the write enable bit if the + ``MIGRATE_PFN_WRITE`` bit is set. + +6. 
``migrate_vma_finalize()`` + + This step replaces the special migration page table entry with the new + page's page table entry and releases the reference to the source and + destination ``struct page``. + +7. ``mmap_read_unlock()`` + + The lock can now be released. + +Exclusive access memory +======================= + +Some devices have features such as atomic PTE bits that can be used to implement +atomic access to system memory. To support atomic operations to a shared virtual +memory page such a device needs access to that page which is exclusive of any +userspace access from the CPU. The ``make_device_exclusive_range()`` function +can be used to make a memory range inaccessible from userspace. + +This replaces all mappings for pages in the given range with special swap +entries. Any attempt to access the swap entry results in a fault which is +resolved by replacing the entry with the original mapping. A driver gets +notified that the mapping has been changed by MMU notifiers, after which point +it will no longer have exclusive access to the page. Exclusive access is +guaranteed to last until the driver drops the page lock and page reference, at +which point any CPU faults on the page may proceed as described. + +Memory cgroup (memcg) and rss accounting +======================================== + +For now, device memory is accounted as any regular page in rss counters (either +anonymous if device page is used for anonymous, file if device page is used for +file backed page, or shmem if device page is used for shared memory). This is a +deliberate choice to keep existing applications, that might start using device +memory without knowing about it, running unimpacted. + +A drawback is that the OOM killer might kill an application using a lot of +device memory and not a lot of regular system memory and thus not freeing much +system memory.
We want to gather more real world experience on how applications +and system react under memory pressure in the presence of device memory before +deciding to account device memory differently. + + +The same decision was made for memory cgroup. Device memory pages are accounted +against the same memory cgroup a regular page would be accounted to. This does +simplify migration to and from device memory. This also means that migration +back from device memory to regular memory cannot fail because it would +go above memory cgroup limit. We might revisit this choice later on once we +get more experience in how device memory is used and its impact on memory +resource control. + + +Note that device memory can never be pinned by a device driver nor through GUP +and thus such memory is always free upon process exit. Or when last reference +is dropped in case of shared memory or file backed memory. diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..f143954e0d05 --- /dev/null +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -0,0 +1,596 @@ +.. _hugetlbfs_reserve: + +===================== +Hugetlbfs Reservation +===================== + +Overview +======== + +Huge pages as described at :ref:`hugetlbpage` are typically +preallocated for application use. These huge pages are instantiated in a +task's address space at page fault time if the VMA indicates huge pages are +to be used. If no huge page exists at page fault time, the task is sent +a SIGBUS and often dies an unhappy death. Shortly after huge page support +was added, it was determined that it would be better to detect a shortage +of huge pages at mmap() time. The idea is that if there were not enough +huge pages to cover the mapping, the mmap() would fail. This was first +done with a simple check in the code at mmap() time to determine if there +were enough free huge pages to cover the mapping.
Like most things in the +kernel, the code has evolved over time. However, the basic idea was to +'reserve' huge pages at mmap() time to ensure that huge pages would be +available for page faults in that mapping. The description below attempts to +describe how huge page reserve processing is done in the v4.10 kernel. + + +Audience +======== +This description is primarily targeted at kernel developers who are modifying +hugetlbfs code. + + +The Data Structures +=================== + +resv_huge_pages + This is a global (per-hstate) count of reserved huge pages. Reserved + huge pages are only available to the task which reserved them. + Therefore, the number of huge pages generally available is computed + as (``free_huge_pages - resv_huge_pages``). +Reserve Map + A reserve map is described by the structure:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + There is one reserve map for each huge page mapping in the system. + The regions list within the resv_map describes the regions within + the mapping. A region is described as:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + The 'from' and 'to' fields of the file region structure are huge page + indices into the mapping. Depending on the type of mapping, a + region in the reserv_map may indicate reservations exist for the + range, or reservations do not exist. +Flags for MAP_PRIVATE Reservations + These are stored in the bottom bits of the reservation map pointer. + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + Indicates this task is the owner of the reservations + associated with the mapping. + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + Indicates task originally mapping this range (and creating + reserves) has unmapped a page from this task (the child) + due to a failed COW. 
+Page Flags + The PagePrivate page flag is used to indicate that a huge page + reservation must be restored when the huge page is freed. More + details will be discussed in the "Freeing huge pages" section. + + +Reservation Map Location (Private or Shared) +============================================ + +A huge page mapping or segment is either private or shared. If private, +it is typically only available to a single address space (task). If shared, +it can be mapped into multiple address spaces (tasks). The location and +semantics of the reservation map is significantly different for the two types +of mappings. Location differences are: + +- For private mappings, the reservation map hangs off the VMA structure. + Specifically, vma->vm_private_data. This reserve map is created at the + time the mapping (mmap(MAP_PRIVATE)) is created. +- For shared mappings, the reservation map hangs off the inode. Specifically, + inode->i_mapping->private_data. Since shared mappings are always backed + by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode + contains a reservation map. As a result, the reservation map is allocated + when the inode is created. + + +Creating Reservations +===================== +Reservations are created when a huge page backed shared memory segment is +created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). +These operations result in a call to the routine hugetlb_reserve_pages():: + + int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +The first thing hugetlb_reserve_pages() does is check if the NORESERVE +flag was specified in either the shmget() or mmap() call. If NORESERVE +was specified, then this routine returns immediately as no reservations +are desired. + +The arguments 'from' and 'to' are huge page indices into the mapping or +underlying file. 
For shmget(), 'from' is always 0 and 'to' corresponds to +the length of the segment/mapping. For mmap(), the offset argument could +be used to specify the offset into the underlying file. In such a case, +the 'from' and 'to' arguments have been adjusted by this offset. + +One of the big differences between PRIVATE and SHARED mappings is the way +in which reservations are represented in the reservation map. + +- For shared mappings, an entry in the reservation map indicates a reservation + exists or did exist for the corresponding page. As reservations are + consumed, the reservation map is not modified. +- For private mappings, the lack of an entry in the reservation map indicates + a reservation exists for the corresponding page. As reservations are + consumed, entries are added to the reservation map. Therefore, the + reservation map can also be used to determine which reservations have + been consumed. + +For private mappings, hugetlb_reserve_pages() creates the reservation map and +hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set +to indicate this VMA owns the reservations. + +The reservation map is consulted to determine how many huge page reservations +are needed for the current mapping/segment. For private mappings, this is +always the value (to - from). However, for shared mappings it is possible that +some reservations may already exist within the range (to - from). See the +section :ref:`Reservation Map Modifications <resv_map_modifications>` +for details on how this is accomplished. + +The mapping may be associated with a subpool. If so, the subpool is consulted +to ensure there is sufficient space for the mapping. It is possible that the +subpool has set aside reservations that can be used for the mapping. See the +section :ref:`Subpool Reservations <sub_pool_resv>` for more details. + +After consulting the reservation map and subpool, the number of needed new +reservations is known.
The routine hugetlb_acct_memory() is called to check +for and take the requested number of reservations. hugetlb_acct_memory() +calls into routines that potentially allocate and adjust surplus page counts. +However, within those routines the code is simply checking to ensure there +are enough free huge pages to accommodate the reservation. If there are, +the global reservation count resv_huge_pages is adjusted something like the +following:: + + if (resv_needed <= (free_huge_pages - resv_huge_pages)) + resv_huge_pages += resv_needed; + +Note that the global lock hugetlb_lock is held when checking and adjusting +these counters. + +If there were enough free huge pages and the global count resv_huge_pages +was adjusted, then the reservation map associated with the mapping is +modified to reflect the reservations. In the case of a shared mapping, a +file_region will exist that includes the range 'from' - 'to'. For private +mappings, no modifications are made to the reservation map as lack of an +entry indicates a reservation exists. + +If hugetlb_reserve_pages() was successful, the global reservation count and +reservation map associated with the mapping will be modified as required to +ensure reservations exist for the range 'from' - 'to'. + +.. _consume_resv: + +Consuming Reservations/Allocating a Huge Page +============================================= + +Reservations are consumed when huge pages associated with the reservations +are allocated and instantiated in the corresponding mapping. The allocation +is performed within the routine alloc_huge_page():: + + struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + +alloc_huge_page is passed a VMA pointer and a virtual address, so it can +consult the reservation map to determine if a reservation exists.
In addition, +alloc_huge_page takes the argument avoid_reserve which indicates reserves +should not be used even if it appears they have been set aside for the +specified address. The avoid_reserve argument is most often used in the case +of Copy on Write and Page Migration where additional copies of an existing +page are being allocated. + +The helper routine vma_needs_reservation() is called to determine if a +reservation exists for the address within the mapping(vma). See the section +:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed +information on what this routine does. +The value returned from vma_needs_reservation() is generally +0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. +If a reservation does not exist, and there is a subpool associated with the +mapping the subpool is consulted to determine if it contains reservations. +If the subpool contains reservations, one can be used for this allocation. +However, in every case the avoid_reserve argument overrides the use of +a reservation for the allocation. After determining whether a reservation +exists and can be used for the allocation, the routine dequeue_huge_page_vma() +is called. This routine takes two arguments related to reservations: + +- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- chg, even though this argument is of type long only the values 0 or 1 are + passed to dequeue_huge_page_vma. If the value is 0, it indicates a + reservation exists (see the section "Memory Policy and Reservations" for + possible issues). If the value is 1, it indicates a reservation does not + exist and the page must be taken from the global free pool if possible. + +The free lists associated with the memory policy of the VMA are searched for +a free page. If a page is found, the value free_huge_pages is decremented +when the page is removed from the free list.
If there was a reservation +associated with the page, the following adjustments are made:: + + SetPagePrivate(page); /* Indicates allocating this page consumed + * a reservation, and if an error is + * encountered such that the page must be + * freed, the reservation will be restored. */ + resv_huge_pages--; /* Decrement the global reservation count */ + +Note, if no huge page can be found that satisfies the VMA's memory policy +an attempt will be made to allocate one using the buddy allocator. This +brings up the issue of surplus huge pages and overcommit which is beyond +the scope of reservations. Even if a surplus page is allocated, the same +reservation based adjustments as above will be made: SetPagePrivate(page) and +resv_huge_pages--. + +After obtaining a new huge page, (page)->private is set to the value of +the subpool associated with the page if it exists. This will be used for +subpool accounting when the page is freed. + +The routine vma_commit_reservation() is then called to adjust the reserve +map based on the consumption of the reservation. In general, this involves +ensuring the page is represented within a file_region structure of the region +map. For shared mappings where the reservation was present, an entry +in the reserve map already existed so no change is made. However, if there +was no reservation in a shared mapping or this was a private mapping a new +entry must be created. + +It is possible that the reserve map could have been changed between the call +to vma_needs_reservation() at the beginning of alloc_huge_page() and the +call to vma_commit_reservation() after the page was allocated. This would +be possible if hugetlb_reserve_pages was called for the same page in a shared +mapping. In such cases, the reservation count and subpool free page count +will be off by one. This rare condition can be identified by comparing the +return value from vma_needs_reservation and vma_commit_reservation.
If such +a race is detected, the subpool and global reserve counts are adjusted to +compensate. See the section +:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more +information on these routines. + + +Instantiate Huge Pages +====================== + +After huge page allocation, the page is typically added to the page tables +of the allocating task. Before this, pages in a shared mapping are added +to the page cache and pages in private mappings are added to an anonymous +reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, +when a huge page that has been instantiated is freed no adjustment is made +to the global reservation count (resv_huge_pages). + + +Freeing Huge Pages +================== + +Huge page freeing is performed by the routine free_huge_page(). This routine +is the destructor for hugetlbfs compound pages. As a result, it is only +passed a pointer to the page struct. When a huge page is freed, reservation +accounting may need to be performed. This would be the case if the page was +associated with a subpool that contained reserves, or the page is being freed +on an error path where a global reserve count must be restored. + +The page->private field points to any subpool associated with the page. +If the PagePrivate flag is set, it indicates the global reserve count should +be adjusted (see the section +:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>` +for information on how these are set). + +The routine first calls hugepage_subpool_put_pages() for the page. If this +routine returns a value of 0 (which does not equal the value passed 1) it +indicates reserves are associated with the subpool, and this newly free page +must be used to keep the number of subpool reserves above the minimum size. +Therefore, the global resv_huge_pages counter is incremented in this case. + +If the PagePrivate flag was set in the page, the global resv_huge_pages counter +will always be incremented. + +..
_sub_pool_resv: + +Subpool Reservations +==================== + +There is a struct hstate associated with each huge page size. The hstate +tracks all huge pages of the specified size. A subpool represents a subset +of pages within a hstate that is associated with a mounted hugetlbfs +filesystem. + +When a hugetlbfs filesystem is mounted a min_size option can be specified +which indicates the minimum number of huge pages required by the filesystem. +If this option is specified, the number of huge pages corresponding to +min_size are reserved for use by the filesystem. This number is tracked in +the min_hpages field of a struct hugepage_subpool. At mount time, +hugetlb_acct_memory(min_hpages) is called to reserve the specified number of +huge pages. If they can not be reserved, the mount fails. + +The routines hugepage_subpool_get/put_pages() are called when pages are +obtained from or released back to a subpool. They perform all subpool +accounting, and track any reservations associated with the subpool. +hugepage_subpool_get/put_pages are passed the number of huge pages by which +to adjust the subpool 'used page' count (down for get, up for put). Normally, +they return the same value that was passed or an error if not enough pages +exist in the subpool. + +However, if reserves are associated with the subpool a return value less +than the passed value may be returned. This return value indicates the +number of additional global pool adjustments which must be made. For example, +suppose a subpool contains 3 reserved huge pages and someone asks for 5. +The 3 reserved pages associated with the subpool can be used to satisfy part +of the request. But, 2 pages must be obtained from the global pools. To +relay this information to the caller, the value 2 is returned. The caller +is then responsible for attempting to obtain the additional two pages from +the global pools. 
+ + +COW and Reservations +==================== + +Since shared mappings all point to and use the same underlying pages, the +biggest reservation concern for COW is private mappings. In this case, +two tasks can be pointing at the same previously allocated page. One task +attempts to write to the page, so a new page must be allocated so that each +task points to its own page. + +When the page was originally allocated, the reservation for that page was +consumed. When an attempt to allocate a new page is made as a result of +COW, it is possible that no free huge pages are free and the allocation +will fail. + +When the private mapping was originally created, the owner of the mapping +was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation +map of the owner. Since the owner created the mapping, the owner owns all +the reservations associated with the mapping. Therefore, when a write fault +occurs and there is no page available, different action is taken for the owner +and non-owner of the reservation. + +In the case where the faulting task is not the owner, the fault will fail and +the task will typically receive a SIGBUS. + +If the owner is the faulting task, we want it to succeed since it owned the +original reservation. To accomplish this, the page is unmapped from the +non-owning task. In this way, the only reference is from the owning task. +In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer +of the non-owning task. The non-owning task may receive a SIGBUS if it later +faults on a non-present page. But, the original owner of the +mapping/reservation will behave as expected. + + +.. _resv_map_modifications: + +Reservation Map Modifications +============================= + +The following low level routines are used to make modifications to a +reservation map. Typically, these routines are not called directly. Rather, +a reservation map helper routine is called which calls one of these low level +routines. 
These low level routines are fairly well documented in the source +code (mm/hugetlb.c). These routines are:: + + long region_chg(struct resv_map *resv, long f, long t); + long region_add(struct resv_map *resv, long f, long t); + void region_abort(struct resv_map *resv, long f, long t); + long region_count(struct resv_map *resv, long f, long t); + +Operations on the reservation map typically involve two operations: + +1) region_chg() is called to examine the reserve map and determine how + many pages in the specified range [f, t) are NOT currently represented. + + The calling code performs global checks and allocations to determine if + there are enough huge pages for the operation to succeed. + +2) + a) If the operation can succeed, region_add() is called to actually modify + the reservation map for the same range [f, t) previously passed to + region_chg(). + b) If the operation can not succeed, region_abort is called for the same + range [f, t) to abort the operation. + +Note that this is a two step process where region_add() and region_abort() +are guaranteed to succeed after a prior call to region_chg() for the same +range. region_chg() is responsible for pre-allocating any data structures +necessary to ensure the subsequent operations (specifically region_add()) +will succeed. + +As mentioned above, region_chg() determines the number of pages in the range +which are NOT currently represented in the map. This number is returned to +the caller. region_add() returns the number of pages in the range added to +the map. In most cases, the return value of region_add() is the same as the +return value of region_chg(). However, in the case of shared mappings it is +possible for changes to the reservation map to be made between the calls to +region_chg() and region_add(). In this case, the return value of region_add() +will not match the return value of region_chg().
It is likely that in such +cases global counts and subpool accounting will be incorrect and in need of +adjustment. It is the responsibility of the caller to check for this condition +and make the appropriate adjustments. + +The routine region_del() is called to remove regions from a reservation map. +It is typically called in the following situations: + +- When a file in the hugetlbfs filesystem is being removed, the inode will + be released and the reservation map freed. Before freeing the reservation + map, all the individual file_region structures must be freed. In this case + region_del is passed the range [0, LONG_MAX). +- When a hugetlbfs file is being truncated. In this case, all allocated pages + after the new file size must be freed. In addition, any file_region entries + in the reservation map past the new end of file must be deleted. In this + case, region_del is passed the range [new_end_of_file, LONG_MAX). +- When a hole is being punched in a hugetlbfs file. In this case, huge pages + are removed from the middle of the file one at a time. As the pages are + removed, region_del() is called to remove the corresponding entry from the + reservation map. In this case, region_del is passed the range + [page_idx, page_idx + 1). + +In every case, region_del() will return the number of pages removed from the +reservation map. In VERY rare cases, region_del() can fail. This can only +happen in the hole punch case where it has to split an existing file_region +entry and can not allocate a new structure. In this error case, region_del() +will return -ENOMEM. The problem here is that the reservation map will +indicate that there is a reservation for the page. However, the subpool and +global reservation counts will not reflect the reservation. To handle this +situation, the routine hugetlb_fix_reserve_counts() is called to adjust the +counters so that they correspond with the reservation map entry that could +not be deleted. 
+ +region_count() is called when unmapping a private huge page mapping. In +private mappings, the lack of an entry in the reservation map indicates that +a reservation exists. Therefore, by counting the number of entries in the +reservation map we know how many reservations were consumed and how many are +outstanding (outstanding = (end - start) - region_count(resv, start, end)). +Since the mapping is going away, the subpool and global reservation counts +are decremented by the number of outstanding reservations. + +.. _resv_map_helpers: + +Reservation Map Helper Routines +=============================== + +Several helper routines exist to query and modify the reservation maps. +These routines are only interested in reservations for a specific huge +page, so they just pass in an address instead of a range. In addition, +they pass in the associated VMA. From the VMA, the type of mapping (private +or shared) and the location of the reservation map (inode or VMA) can be +determined. These routines simply call the underlying routines described +in the section "Reservation Map Modifications". However, they do take into +account the 'opposite' meaning of reservation map entries for private and +shared mappings and hide this detail from the caller:: + + long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This routine calls region_chg() for the specified page. If no reservation +exists, 1 is returned. If a reservation exists, 0 is returned:: + + long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This calls region_add() for the specified page. As in the case of region_chg +and region_add, this routine is to be called after a previous call to +vma_needs_reservation. It will add a reservation entry for the page. It +returns 1 if the reservation was added and 0 if not. The return value should +be compared with the return value of the previous call to +vma_needs_reservation.
An unexpected difference indicates the reservation +map was modified between calls:: + + void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This calls region_abort() for the specified page. As in the case of region_chg +and region_abort, this routine is to be called after a previous call to +vma_needs_reservation. It will abort/end the in progress reservation add +operation:: + + long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +This is a special wrapper routine to help facilitate reservation cleanup +on error paths. It is only called from the routine restore_reserve_on_error(). +This routine is used in conjunction with vma_needs_reservation in an attempt +to add a reservation to the reservation map. It takes into account the +different reservation map semantics for private and shared mappings. Hence, +region_add is called for shared mappings (as an entry present in the map +indicates a reservation), and region_del is called for private mappings (as +the absence of an entry in the map indicates a reservation). See the section +"Reservation cleanup in error paths" for more information on what needs to +be done on error paths. + + +Reservation Cleanup in Error Paths +================================== + +As mentioned in the section +:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation +map modifications are performed in two steps. First vma_needs_reservation +is called before a page is allocated. If the allocation is successful, +then vma_commit_reservation is called. If not, vma_end_reservation is called. +Global and subpool reservation counts are adjusted based on success or failure +of the operation and all is well. + +Additionally, after a huge page is instantiated the PagePrivate flag is +cleared so that accounting when the page is ultimately freed is correct.
+ +However, there are several instances where errors are encountered after a huge +page is allocated but before it is instantiated. In this case, the page +allocation has consumed the reservation and made the appropriate subpool, +reservation map and global count adjustments. If the page is freed at this +time (before instantiation and clearing of PagePrivate), then free_huge_page +will increment the global reservation count. However, the reservation map +indicates the reservation was consumed. This resulting inconsistent state +will cause the 'leak' of a reserved huge page. The global reserve count will +be higher than it should and prevent allocation of a pre-allocated page. + +The routine restore_reserve_on_error() attempts to handle this situation. It +is fairly well documented. The intention of this routine is to restore +the reservation map to the way it was before the page allocation. In this +way, the state of the reservation map will correspond to the global reservation +count after the page is freed. + +The routine restore_reserve_on_error itself may encounter errors while +attempting to restore the reservation map entry. In this case, it will +simply clear the PagePrivate flag of the page. In this way, the global +reserve count will not be incremented when the page is freed. However, the +reservation map will continue to look as though the reservation was consumed. +A page can still be allocated for the address, but it will not use a reserved +page as originally intended. + +There is some code (most notably userfaultfd) which can not call +restore_reserve_on_error. In this case, it simply modifies the PagePrivate +so that a reservation will not be leaked when the huge page is freed. + + +Reservations and Memory Policy +============================== +Per-node huge page lists existed in struct hstate when git was first used +to manage Linux code. The concept of reservations was added some time later. 
+When reservations were added, no attempt was made to take memory policy +into account. While cpusets are not exactly the same as memory policy, this +comment in hugetlb_acct_memory sums up the interaction between reservations +and cpusets/memory policy:: + + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + +Hugetlbfs regression testing +============================ + +The most complete set of hugetlb tests are in the libhugetlbfs repository. +If you modify any hugetlb related code, use the libhugetlbfs test suite +to check for regressions. In addition, if you add any new hugetlb +functionality, please add appropriate tests to libhugetlbfs. 
+ +-- +Mike Kravetz, 7 April 2017 diff --git a/Documentation/mm/hwpoison.rst b/Documentation/mm/hwpoison.rst new file mode 100644 index 000000000000..b9d5253c1305 --- /dev/null +++ b/Documentation/mm/hwpoison.rst @@ -0,0 +1,184 @@ +.. hwpoison: + +======== +hwpoison +======== + +What is hwpoison? +================= + +Upcoming Intel CPUs have support for recovering from some memory errors +(``MCA recovery``). This requires the OS to declare a page "poisoned", +kill the processes associated with it and avoid using it in the future. + +This patchkit implements the necessary infrastructure in the VM. + +To quote the overview comment:: + + High level machine check handler. Handles pages reported by the + hardware as being corrupted usually due to a 2bit ECC memory or cache + failure. + + This focusses on pages detected as corrupted in the background. + When the current CPU tries to consume corruption the currently + running process can just be killed directly instead. This implies + that if the error cannot be handled for some reason it's safe to + just ignore it because no corruption has been consumed yet. Instead + when that happens another machine check will happen. + + Handles page cache pages in various states. The tricky part + here is that we can access any page asynchronous to other VM + users, because memory failures could happen anytime and anywhere, + possibly violating some of their assumptions. This is why this code + has to be extremely careful. Generally it tries to use normal locking + rules, as in get the standard locks, even if that means the + error handling takes potentially a long time. + + Some of the operations here are somewhat inefficient and have non + linear algorithmic complexity, because the data structures have not + been optimized for this case. This is in particular the case + for the mapping from a vma to a process. Since this case is expected + to be rare we hope we can get away with this. 
+ +The code consists of the high level handler in mm/memory-failure.c, +a new page poison bit and various checks in the VM to handle poisoned +pages. + +The main target right now is KVM guests, but it works for all kinds +of applications. KVM support requires a recent qemu-kvm release. + +For the KVM use there was a need for a new signal type so that +KVM can inject the machine check into the guest with the proper +address. This in theory allows other applications to handle +memory failures too. The expectation is that nearly all applications +won't do that, but some very specialized ones might. + +Failure recovery modes +====================== + +There are two (actually three) modes memory failure recovery can be in: + +vm.memory_failure_recovery sysctl set to zero: + All memory failures cause a panic. Do not attempt recovery. + +early kill + (can be controlled globally and per process) + Send SIGBUS to the application as soon as the error is detected. + This allows applications to process memory errors in a gentle + way (e.g. drop the affected object). + This is the mode used by KVM qemu. + +late kill + Send SIGBUS when the application runs into the corrupted page. + This is best for memory error unaware applications and is the default. + Note some pages are always handled as late kill. + +User control +============ + +vm.memory_failure_recovery + See sysctl.txt + +vm.memory_failure_early_kill + Enable early kill mode globally + +PR_MCE_KILL + Set early/late kill mode/revert to system default + + arg1: PR_MCE_KILL_CLEAR: + Revert to system default + arg1: PR_MCE_KILL_SET: + arg2 defines thread specific mode + + PR_MCE_KILL_EARLY: + Early kill + PR_MCE_KILL_LATE: + Late kill + PR_MCE_KILL_DEFAULT + Use system global default + + Note that if you want to have a dedicated thread which handles + the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should + call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, + the SIGBUS is sent to the main thread.
+ +PR_MCE_KILL_GET + return current mode + +Testing +======= + +* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the + process for testing + +* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` + + corrupt-pfn + Inject hwpoison fault at PFN echoed into this file. This does + some early filtering to avoid corrupting unintended pages in test suites. + + unpoison-pfn + Software-unpoison page at PFN echoed into this file. This way + a page can be reused again. This only works for Linux + injected failures, not for real memory failures. Once any hardware + memory failure happens, this feature is disabled. + + Note these injection interfaces are not stable and might change between + kernel versions. + + corrupt-filter-dev-major, corrupt-filter-dev-minor + Only handle memory failures to pages associated with the file + system defined by block device major/minor. -1U is the + wildcard value. This should be only used for testing with + artificial injection. + + corrupt-filter-memcg + Limit injection to pages owned by memgroup. Specified by inode + number of the memcg. + + Example:: + + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + + corrupt-filter-flags-mask, corrupt-filter-flags-value + When specified, only poison pages if ((page_flags & mask) == + value). This allows stress testing of many kinds of + pages. The page_flags are the same as in /proc/kpageflags. The + flag bits are defined in include/linux/kernel-page-flags.h and + documented in Documentation/admin-guide/mm/pagemap.rst + +* Architecture specific MCE injector + + x86 has mce-inject, mce-test + + Some portable hwpoison test programs in mce-test, see below.
+ +References +========== + +http://halobates.de/mce-lc09-2.pdf + Overview presentation from LinuxCon 09 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + Test suite (hwpoison specific portable tests in tsrc) + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86 specific injector + + +Limitations +=========== +- Not all page types are supported and never will. Most kernel internal + objects cannot be recovered, only LRU pages for now. + +--- +Andi Kleen, Oct 2009 diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst new file mode 100644 index 000000000000..575ccd40e30c --- /dev/null +++ b/Documentation/mm/index.rst @@ -0,0 +1,68 @@ +===================================== +Linux Memory Management Documentation +===================================== + +Memory Management Guide +======================= + +This is a guide to understanding the memory management subsystem +of Linux. If you are looking for advice on simply allocating memory, +see the :ref:`memory_allocation`. For controlling and tuning guides, +see the :doc:`admin guide <../admin-guide/mm/index>`. + +.. toctree:: + :maxdepth: 1 + + physical_memory + page_tables + process_addrs + bootmem + page_allocation + vmalloc + slab + highmem + page_reclaim + swap + page_cache + shmfs + oom + +Legacy Documentation +==================== + +This is a collection of older documents about the Linux memory management +(MM) subsystem internals with different level of details ranging from +notes and mailing list responses for elaborating descriptions of data +structures and algorithms. It should all be integrated nicely into the +above structured documentation, or deleted if it has served its purpose. + +.. 
toctree:: + :maxdepth: 1 + + active_mm + arch_pgtable_helpers + balance + damon/index + free_page_reporting + frontswap + hmm + hwpoison + hugetlbfs_reserv + ksm + memory-model + mmu_notifier + numa + overcommit-accounting + page_migration + page_frags + page_owner + page_table_check + remap_file_pages + slub + split_page_table_lock + transhuge + unevictable-lru + vmalloced-kernel-stacks + vmemmap_dedup + z3fold + zsmalloc diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst new file mode 100644 index 000000000000..9e37add068e6 --- /dev/null +++ b/Documentation/mm/ksm.rst @@ -0,0 +1,87 @@ +.. _ksm: + +======================= +Kernel Samepage Merging +======================= + +KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, +added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation, +and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/ + +The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>` + +Design +====== + +Overview +-------- + +.. kernel-doc:: mm/ksm.c + :DOC: Overview + +Reverse mapping +--------------- +KSM maintains reverse mapping information for KSM pages in the stable +tree. + +If a KSM page is shared between fewer than ``max_page_sharing`` VMAs, +the node of the stable tree that represents such KSM page points to a +list of struct rmap_item and the ``page->mapping`` of the +KSM page points to the stable tree node. + +When the sharing passes this threshold, KSM adds a second dimension to +the stable tree. The tree node becomes a "chain" that links one or +more "dups". Each "dup" keeps reverse mapping information for a KSM +page with ``page->mapping`` pointing to that "dup". + +Every "chain" and all "dups" linked into a "chain" enforce the +invariant that they represent the same write protected memory content, +even if each "dup" will be pointed to by a different KSM page copy of +that content.
+ +This way the stable tree lookup computational complexity is unaffected +if compared to an unlimited list of reverse mappings. It is still +enforced that there cannot be KSM page content duplicates in the +stable tree itself. + +The deduplication limit enforced by ``max_page_sharing`` is required +to keep the virtual memory rmap lists from growing too large. The rmap +walk has O(N) complexity where N is the number of rmap_items +(i.e. virtual mappings) that are sharing the page, which is in turn +capped by ``max_page_sharing``. So this effectively spreads the linear +O(N) computational complexity from rmap walk context over different +KSM pages. The ksmd walk over the stable_node "chains" is also O(N), +but N is the number of stable_node "dups", not the number of +rmap_items, so it does not have a significant impact on ksmd performance. In +practice the best stable_node "dup" candidate will be kept and found +at the head of the "dups" list. + +High values of ``max_page_sharing`` result in faster memory merging +(because there will be fewer stable_node dups queued into the +stable_node chain->hlist to check for pruning) and higher +deduplication factor at the expense of slower worst case for rmap +walks for any KSM page which can happen during swapping, compaction, +NUMA balancing and page migration. + +The ``stable_node_dups/stable_node_chains`` ratio is also affected by the +``max_page_sharing`` tunable, and a high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +defragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to free up +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slow down the readonly computations on +the KSM pages of the applications. + +The whole list of stable_node "dups" linked in the stable_node +"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by +``stable_node_chains_prune_millisecs`` sysfs tunable. + +Reference +--------- +.. kernel-doc:: mm/ksm.c + :functions: mm_slot ksm_scan stable_node rmap_item + +-- +Izik Eidus, +Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst new file mode 100644 index 000000000000..3779e562dc76 --- /dev/null +++ b/Documentation/mm/memory-model.rst @@ -0,0 +1,177 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _physical_memory_model: + +===================== +Physical Memory Model +===================== + +Physical memory in a system may be addressed in different ways. The +simplest case is when the physical memory starts at address 0 and +spans a contiguous range up to the maximal address. It could be, +however, that this range contains small holes that are not accessible +for the CPU. Then there could be several contiguous ranges at +completely distinct addresses. And, don't forget about NUMA, where +different memory banks are attached to different CPUs. + +Linux abstracts this diversity using one of the two memory models: +FLATMEM and SPARSEMEM. Each architecture defines what +memory models it supports, what the default memory model is and +whether it is possible to manually override that default. + +All the memory models track the status of physical page frames using +struct page arranged in one or more arrays. + +Regardless of the selected memory model, there exists one-to-one +mapping between the physical page frame number (PFN) and the +corresponding `struct page`. + +Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` +helpers that allow the conversion from PFN to `struct page` and vice +versa. + +FLATMEM +======= + +The simplest memory model is FLATMEM. This model is suitable for +non-NUMA systems with contiguous, or mostly contiguous, physical +memory. 
+ +In the FLATMEM memory model, there is a global `mem_map` array that +maps the entire physical memory. For most architectures, the holes +have entries in the `mem_map` array. The `struct page` objects +corresponding to the holes are never fully initialized. + +To allocate the `mem_map` array, architecture specific setup code should +call :c:func:`free_area_init` function. Yet, the mappings array is not +usable until the call to :c:func:`memblock_free_all` that hands all the +memory to the page allocator. + +An architecture may free parts of the `mem_map` array that do not cover the +actual physical pages. In such case, the architecture specific +:c:func:`pfn_valid` implementation should take the holes in the +`mem_map` into account. + +With FLATMEM, the conversion between a PFN and the `struct page` is +straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the +`mem_map` array. + +The `ARCH_PFN_OFFSET` defines the first page frame number for +systems with physical memory starting at address different from 0. + +SPARSEMEM +========= + +SPARSEMEM is the most versatile memory model available in Linux and it +is the only memory model that supports several advanced features such +as hot-plug and hot-remove of the physical memory, alternative memory +maps for non-volatile memory devices and deferred initialization of +the memory map for larger systems. + +The SPARSEMEM model presents the physical memory as a collection of +sections. A section is represented with struct mem_section +that contains `section_mem_map` that is, logically, a pointer to an +array of struct pages. However, it is stored with some other magic +that aids the sections management. The section size and maximal number +of section is specified using `SECTION_SIZE_BITS` and +`MAX_PHYSMEM_BITS` constants defined by each architecture that +supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a +physical address that an architecture supports, the +`SECTION_SIZE_BITS` is an arbitrary value. 
+ +The maximal number of sections is denoted `NR_MEM_SECTIONS` and +defined as + +.. math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +The `mem_section` objects are arranged in a two-dimensional array +called `mem_sections`. The size and placement of this array depend +on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of +sections: + +* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` + array is static and has `NR_MEM_SECTIONS` rows. Each row holds a + single `mem_section` object. +* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` + array is dynamically allocated. Each row contains PAGE_SIZE worth of + `mem_section` objects and the number of rows is calculated to fit + all the memory sections. + +The architecture setup code should call sparse_init() to +initialize the memory sections and the memory maps. + +With SPARSEMEM there are two possible ways to convert a PFN to the +corresponding `struct page` - a "classic sparse" and "sparse +vmemmap". The selection is made at build time and it is determined by +the value of `CONFIG_SPARSEMEM_VMEMMAP`. + +The classic sparse encodes the section number of a page in page->flags +and uses high bits of a PFN to access the section that maps that page +frame. Inside a section, the PFN is the index to the array of pages. + +The sparse vmemmap uses a virtually mapped memory map to optimize +pfn_to_page and page_to_pfn operations. There is a global `struct +page *vmemmap` pointer that points to a virtually contiguous array of +`struct page` objects. A PFN is an index to that array and the +offset of the `struct page` from `vmemmap` is the PFN of that +page. + +To use vmemmap, an architecture has to reserve a range of virtual +addresses that will map the physical pages containing the memory +map and make sure that `vmemmap` points to that range. 
In addition, +the architecture should implement :c:func:`vmemmap_populate` method +that will allocate the physical memory and create page tables for the +virtual memory map. If an architecture does not have any special +requirements for the vmemmap mappings, it can use default +:c:func:`vmemmap_populate_basepages` provided by the generic memory +management. + +The virtually mapped memory map allows storing `struct page` objects +for persistent memory devices in pre-allocated storage on those +devices. This storage is represented with struct vmem_altmap +that is eventually passed to vmemmap_populate() through a long chain +of function calls. The vmemmap_populate() implementation may use the +`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to +allocate memory map on the persistent memory device. + +ZONE_DEVICE +=========== +The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer +`struct page` `mem_map` services for device driver identified physical +address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact +that the page objects for these address ranges are never marked online, +and that a reference must be taken against the device, not just the page +to keep the memory pinned for active use. `ZONE_DEVICE`, via +:c:func:`devm_memremap_pages`, performs just enough memory hotplug to +turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and +:c:func:`get_user_pages` service for the given range of pfns. Since the +page reference count never drops below 1 the page is never tracked as +free memory and the page's `struct list_head lru` space is repurposed +for back referencing to the host device / driver that mapped the memory. + +While `SPARSEMEM` presents memory as a collection of sections, +optionally collected into memory blocks, `ZONE_DEVICE` users have a need +for smaller granularity of populating the `mem_map`. 
Given that +`ZONE_DEVICE` memory is never marked online it is subsequently never +subject to its memory ranges being exposed through the sysfs memory +hotplug api on memory block boundaries. The implementation relies on +this lack of user-api constraint to allow sub-section sized memory +ranges to be specified to :c:func:`arch_add_memory`, the top-half of +memory hotplug. Sub-section support allows for 2MB as the cross-arch +common alignment granularity for :c:func:`devm_memremap_pages`. + +The users of `ZONE_DEVICE` are: + +* pmem: Map platform persistent memory to be used as a direct-I/O target + via DAX mappings. + +* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` + event callbacks to allow a device-driver to coordinate memory management + events related to device-memory, typically GPU memory. See + Documentation/mm/hmm.rst. + +* p2pdma: Create `struct page` objects to allow peer devices in a + PCI/-E topology to coordinate direct-DMA operations between themselves, + i.e. bypass host memory. diff --git a/Documentation/mm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst new file mode 100644 index 000000000000..df5d7777fc6b --- /dev/null +++ b/Documentation/mm/mmu_notifier.rst @@ -0,0 +1,99 @@ +.. _mmu_notifier: + +When do you need to notify inside page table lock ? +=================================================== + +When clearing a pte/pmd we are given a choice to notify the event through +(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under +the page table lock. But that notification is not necessary in all cases. + +For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use +thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a +process virtual address space). 
There are only 2 cases when you need to notify +those secondary TLBs while holding page table lock when clearing a pte/pmd: + + A) page backing address is freed before mmu_notifier_invalidate_range_end() + B) a page table entry is updated to point to a new page (COW, write fault + on zero page, __replace_page(), ...) + +Case A is obvious: you do not want to take the risk for the device to write to +a page that might now be used by some completely different task. + +Case B is more subtle. For correctness it requires the following sequence to +happen: + + - take page table lock + - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) + - set page table entry to point to new page + +If clearing the page table entry is not followed by a notify before setting +the new pte/pmd value then you can break a memory model like C11 or C++11 for +the device. + +Consider the following scenario (device uses a feature similar to ATS/PASID): + +Two addresses addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE; we assume +they are write protected for COW (other cases of B apply too).
+ +:: + + [Time N] -------------------------------------------------------------------- + CPU-thread-0 {try to write to addrA} + CPU-thread-1 {try to write to addrB} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {read addrA and populate device TLB} + DEV-thread-2 {read addrB and populate device TLB} + [Time N+1] ------------------------------------------------------------------ + CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} + CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+2] ------------------------------------------------------------------ + CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} + CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {write to addrA which is a write to new page} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {} + CPU-thread-3 {write to addrB which is a write to new page} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+4] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+5] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {read addrA from old page} + DEV-thread-2 {read addrB from new page} + +So here because at time N+2 the clear page table entry was not pair with a +notification 
to invalidate the secondary TLB, the device sees the new value for +addrB before seeing the new value for addrA. This breaks total memory ordering +for the device. + +When changing a pte to write protect or to point to a new write protected page +with the same content (KSM) it is fine to delay the mmu_notifier_invalidate_range +call to mmu_notifier_invalidate_range_end() outside the page table lock. This +is true even if the thread doing the page table update is preempted right after +releasing page table lock but before calling mmu_notifier_invalidate_range_end(). diff --git a/Documentation/mm/numa.rst b/Documentation/mm/numa.rst new file mode 100644 index 000000000000..99fdeca917ca --- /dev/null +++ b/Documentation/mm/numa.rst @@ -0,0 +1,150 @@ +.. _numa: + +Started Nov 1999 by Kanoj Sarcar + +============= +What is NUMA? +============= + +This question can be answered from a couple of perspectives: the +hardware view and the Linux software view. + +From the hardware perspective, a NUMA system is a computer platform that +comprises multiple components or assemblies each of which may contain 0 +or more CPUs, local memory, and/or IO buses. For brevity and to +disambiguate the hardware view of these physical components/assemblies +from the software abstraction thereof, we'll call the components/assemblies +'cells' in this document. + +Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset +of the system--although some components necessary for a stand-alone SMP system +may not be populated on any given cell. The cells of the NUMA system are +connected together with some sort of system interconnect--e.g., a crossbar or +point-to-point link are common types of NUMA system interconnects. Both of +these types of interconnects can be aggregated to create NUMA platforms with +cells at multiple distances from other cells. + +For Linux, the NUMA platforms of interest are primarily what is known as Cache +Coherent NUMA or ccNUMA systems.
With ccNUMA systems, all memory is visible +to and accessible from any CPU attached to any cell and cache coherency +is handled in hardware by the processor caches and/or the system interconnect. + +Memory access time and effective memory bandwidth varies depending on how far +away the cell containing the CPU or IO bus making the memory access is from the +cell containing the target memory. For example, access to memory by CPUs +attached to the same cell will experience faster access times and higher +bandwidths than accesses to memory on other, remote cells. NUMA platforms +can have cells at multiple remote distances from any given cell. + +Platform vendors don't build NUMA systems just to make software developers' +lives interesting. Rather, this architecture is a means to provide scalable +memory bandwidth. However, to achieve scalable memory bandwidth, system and +application software must arrange for a large majority of the memory references +[cache misses] to be to "local" memory--memory on the same cell, if any--or +to the closest cell with memory. + +This leads to the Linux software view of a NUMA system: + +Linux divides the system's hardware resources into multiple software +abstractions called "nodes". Linux maps the nodes onto the physical cells +of the hardware platform, abstracting away some of the details for some +architectures. As with physical cells, software nodes may contain 0 or more +CPUs, memory and/or IO buses. And, again, memory accesses to memory on +"closer" nodes--nodes that map to closer cells--will generally experience +faster access times and higher effective bandwidth than accesses to more +remote cells. + +For some architectures, such as x86, Linux will "hide" any node representing a +physical cell that has no memory attached, and reassign any CPUs attached to +that cell to a node representing a cell that does have memory. 
Thus, on +these architectures, one cannot assume that all CPUs that Linux associates with +a given node will see the same local memory access times and bandwidth. + +In addition, for some architectures, again x86 is an example, Linux supports +the emulation of additional nodes. For NUMA emulation, Linux will carve up +the existing nodes--or the system memory for non-NUMA platforms--into multiple +nodes. Each emulated node will manage a fraction of the underlying cells' +physical memory. NUMA emulation is useful for testing NUMA kernel and +application features on non-NUMA platforms, and as a sort of memory resource +management mechanism when used together with cpusets. +[see Documentation/admin-guide/cgroup-v1/cpusets.rst] + +For each node with memory, Linux constructs an independent memory management +subsystem, complete with its own free page lists, in-use page lists, usage +statistics and locks to mediate access. In addition, Linux constructs for +each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], +an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a +selected zone/node cannot satisfy the allocation request. This situation, +when a zone has no available memory to satisfy a request, is called +"overflow" or "fallback". + +Because some nodes contain multiple zones containing different types of +memory, Linux must decide whether to order the zonelists such that allocations +fall back to the same zone type on a different node, or to a different zone +type on the same node. This is an important consideration because some zones, +such as DMA or DMA32, represent relatively scarce resources. Linux chooses +a default Node ordered zonelist. This means it tries to fall back to other zones +from the same node before using remote nodes which are ordered by NUMA distance. + +By default, Linux will attempt to satisfy memory allocation requests from the +node to which the CPU that executes the request is assigned.
Specifically, +Linux will attempt to allocate from the first node in the appropriate zonelist +for the node where the request originates. This is called "local allocation." +If the "local" node cannot satisfy the request, the kernel will examine other +nodes' zones in the selected zonelist looking for the first zone in the list +that can satisfy the request. + +Local allocation will tend to keep subsequent access to the allocated memory +"local" to the underlying physical resources and off the system interconnect-- +as long as the task on whose behalf the kernel allocated some memory does not +later migrate away from that memory. The Linux scheduler is aware of the +NUMA topology of the platform--embodied in the "scheduling domains" data +structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler +attempts to minimize task migration to distant scheduling domains. However, +the scheduler does not take a task's NUMA footprint into account directly. +Thus, under sufficient imbalance, tasks can migrate between nodes, remote +from their initial node and kernel data structures. + +System administrators and application designers can restrict a task's migration +to improve NUMA locality using various CPU affinity command line interfaces, +such as taskset(1) and numactl(1), and program interfaces such as +sched_setaffinity(2). Further, one can modify the kernel's default local +allocation behavior using Linux NUMA memory policy. [see +:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`]. + +System administrators can restrict the CPUs and nodes' memories that a non- +privileged user can specify in the scheduling or NUMA commands and functions +using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] + +On architectures that do not hide memoryless nodes, Linux will include only +zones [nodes] with memory in the zonelists.
This means that for a memoryless +node the "local memory node"--the node of the first zone in CPU's node's +zonelist--will not be the node itself. Rather, it will be the node that the +kernel selected as the nearest node with memory when it built the zonelists. +So, default, local allocations will succeed with the kernel supplying the +closest available memory. This is a consequence of the same mechanism that +allows such allocations to fallback to other nearby nodes when a node that +does contain memory overflows. + +Some kernel allocations do not want or cannot tolerate this allocation fallback +behavior. Rather they want to be sure they get memory from the specified node +or get notified that the node has no free memory. This is usually the case when +a subsystem allocates per CPU memory resources, for example. + +A typical model for making such an allocation is to obtain the node id of the +node to which the "current CPU" is attached using one of the kernel's +numa_node_id() or CPU_to_node() functions and then request memory from only +the node id returned. When such an allocation fails, the requesting subsystem +may revert to its own fallback path. The slab kernel memory allocator is an +example of this. Or, the subsystem may choose to disable or not to enable +itself on allocation failure. The kernel profiling subsystem is an example of +this. + +If the architecture supports--does not hide--memoryless nodes, then CPUs +attached to memoryless nodes would always incur the fallback path overhead +or some subsystems would fail to initialize if they attempted to allocated +memory exclusively from a node without memory. To support such +architectures transparently, kernel subsystems can use the numa_mem_id() +or cpu_to_mem() function to locate the "local memory node" for the calling or +specified CPU. Again, this is the same node from which default, local page +allocations will be attempted. 
diff --git a/Documentation/mm/oom.rst b/Documentation/mm/oom.rst new file mode 100644 index 000000000000..18e9e40c1ec1 --- /dev/null +++ b/Documentation/mm/oom.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Out Of Memory Handling +====================== diff --git a/Documentation/mm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst new file mode 100644 index 000000000000..1addb0c374a4 --- /dev/null +++ b/Documentation/mm/overcommit-accounting.rst @@ -0,0 +1,88 @@ +.. _overcommit_accounting: + +===================== +Overcommit Accounting +===================== + +The Linux kernel supports the following overcommit handling modes + +0 + Heuristic overcommit handling. Obvious overcommits of address + space are refused. Used for a typical system. It ensures a + seriously wild allocation fails while allowing overcommit to + reduce swap usage. root is allowed to allocate slightly more + memory in this mode. This is the default. + +1 + Always overcommit. Appropriate for some scientific + applications. Classic example is code using sparse arrays and + just relying on the virtual memory consisting almost entirely + of zero pages. + +2 + Don't overcommit. The total address space commit for the + system is not permitted to exceed swap + a configurable amount + (default is 50%) of physical RAM. Depending on the amount you + use, in most situations this means a process will not be + killed while accessing pages but will receive errors on memory + allocation as appropriate. + + Useful for applications that want to guarantee their memory + allocations will be available in the future without having to + initialize every page. + +The overcommit policy is set via the sysctl ``vm.overcommit_memory``. + +The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) +or ``vm.overcommit_kbytes`` (absolute value). These only have an effect +when ``vm.overcommit_memory`` is set to 2. 
+ +The current overcommit limit and amount committed are viewable in +``/proc/meminfo`` as CommitLimit and Committed_AS respectively. + +Gotchas +======= + +The C language stack growth does an implicit mremap. If you want absolute +guarantees and run close to the edge you MUST mmap your stack for the +largest size you think you will need. For typical stack usage this does +not matter much but it's a corner case if you really really care + +In mode 2 the MAP_NORESERVE flag is ignored. + + +How It Works +============ + +The overcommit is based on the following rules + +For a file backed map + | SHARED or READ-only - 0 cost (the file is the map not swap) + | PRIVATE WRITABLE - size of mapping per instance + +For an anonymous or ``/dev/zero`` map + | SHARED - size of mapping + | PRIVATE READ-only - 0 cost (but of little use) + | PRIVATE WRITABLE - size of mapping per instance + +Additional accounting + | Pages made writable copies by mmap + | shmfs memory drawn from the same pool + +Status +====== + +* We account mmap memory mappings +* We account mprotect changes in commit +* We account mremap changes in size +* We account brk +* We account munmap +* We report the commit status in /proc +* Account and check on fork +* Review stack handling/building on exec +* SHMfs accounting +* Implement actual limit enforcement + +To Do +===== +* Account ptrace pages (this is hard) diff --git a/Documentation/mm/page_allocation.rst b/Documentation/mm/page_allocation.rst new file mode 100644 index 000000000000..d9b4495561f1 --- /dev/null +++ b/Documentation/mm/page_allocation.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Page Allocation +=============== diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst new file mode 100644 index 000000000000..75eba7c431b2 --- /dev/null +++ b/Documentation/mm/page_cache.rst @@ -0,0 +1,5 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +========== +Page Cache +========== diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst new file mode 100644 index 000000000000..7d6f9385d129 --- /dev/null +++ b/Documentation/mm/page_frags.rst @@ -0,0 +1,45 @@ +.. _page_frags: + +============== +Page fragments +============== + +A page fragment is an arbitrary-length arbitrary-offset area of memory +which resides within a 0 or higher order compound page. Multiple +fragments within that page are individually refcounted, in the page's +reference counter. + +The page_frag functions, page_frag_alloc and page_frag_free, provide a +simple allocation framework for page fragments. This is used by the +network stack and network device drivers to provide a backing region of +memory for use as either an sk_buff->head, or to be used in the "frags" +portion of skb_shared_info. + +In order to make use of the page fragment APIs a backing page fragment +cache is needed. This provides a central point for the fragment allocation +and tracks allows multiple calls to make use of a cached page. The +advantage to doing this is that multiple calls to get_page can be avoided +which can be expensive at allocation time. However due to the nature of +this caching it is required that any calls to the cache be protected by +either a per-cpu limitation, or a per-cpu limitation and forcing interrupts +to be disabled when executing the fragment allocation. + +The network stack uses two separate caches per CPU to handle fragment +allocation. The netdev_alloc_cache is used by callers making use of the +netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is +used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +main difference between these two calls is the context in which they may be +called. 
The "netdev" prefixed functions are usable in any context as these +functions will disable interrupts, while the "napi" prefixed functions are +only usable within the softirq context. + +Many network device drivers use a similar methodology for allocating page +fragments, but the page fragments are cached at the ring or descriptor +level. In order to enable these cases it is necessary to provide a generic +way of tearing down a page cache. For this reason __page_frag_cache_drain +was implemented. It allows for freeing multiple references from a single +page via a single call. The advantage to doing this is that it allows for +cleaning up the multiple references that were added to a page in order to +avoid calling get_page per allocation. + +Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst new file mode 100644 index 000000000000..8c5cb8147e55 --- /dev/null +++ b/Documentation/mm/page_migration.rst @@ -0,0 +1,288 @@ +.. _page_migration: + +============== +Page migration +============== + +Page migration allows moving the physical location of pages between +nodes in a NUMA system while the process is running. This means that the +virtual addresses that the process sees do not change. However, the +system rearranges the physical location of those pages. + +Also see :ref:`Heterogeneous Memory Management (HMM) ` +for migrating pages to or from device private memory. + +The main intent of page migration is to reduce the latency of memory accesses +by moving pages near to the processor where the process accessing that memory +is running. + +Page migration allows a process to manually relocate the node on which its +pages are located through the MF_MOVE and MF_MOVE_ALL options while setting +a new memory policy via mbind(). The pages of a process can also be relocated +from another process using the sys_migrate_pages() function call. 
The +migrate_pages() function call takes two sets of nodes and moves pages of a +process that are located on the from nodes to the destination nodes. +Page migration functions are provided by the numactl package by Andi Kleen +(a version later than 0.9.3 is required. Get it from +https://github.com/numactl/numactl.git). numactl provides libnuma +which provides an interface similar to other NUMA functionality for page +migration. cat ``/proc//numa_maps`` allows an easy review of where the +pages of a process are located. See also the numa_maps documentation in the +proc(5) man page. + +Manual migration is useful if for example the scheduler has relocated +a process to a processor on a distant node. A batch scheduler or an +administrator may detect the situation and move the pages of the process +nearer to the new processor. The kernel itself only provides +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +For example, A NUMA profiler may obtain a log showing frequent off-node +accesses and may use the result to move pages to more advantageous +locations. + +Larger installations usually partition the system using cpusets into +sections of nodes. Paul Jackson has equipped cpusets with the ability to +move pages when a task is moved to another cpuset (See +:ref:`CPUSETS `). +Cpusets allow the automation of process locality. If a task is moved to +a new cpuset then also all its pages are moved with it so that the +performance of the process does not sink dramatically. Also the pages +of processes in a cpuset are moved if the allowed memory nodes of a +cpuset are changed. + +Page migration allows the preservation of the relative location of pages +within a group of nodes for all migration techniques which will preserve a +particular memory allocation pattern generated even after migrating a +process. 
This is necessary in order to preserve the memory latencies. +Processes will run with similar performance after migration. + +Page migration occurs in several steps. First a high level +description for those trying to use migrate_pages() from the kernel +(for userspace usage see the Andi Kleen's numactl package mentioned above) +and then a low level description of how the low level details work. + +In kernel use of migrate_pages() +================================ + +1. Remove pages from the LRU. + + Lists of pages to be migrated are generated by scanning over + pages and moving them into lists. This is done by + calling isolate_lru_page(). + Calling isolate_lru_page() increases the references to the page + so that it cannot vanish while the page migration occurs. + It also prevents the swapper or other scans from encountering + the page. + +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. + +3. The migrate_pages() function is called which attempts + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. + +How migrate_pages() works +========================= + +migrate_pages() does several passes over its list of pages. A page is moved +if all references to a page are removable at the time. The page has +already been removed from the LRU via isolate_lru_page() and the refcount +is increased so that the page cannot be freed while page migration occurs. + +Steps: + +1. Lock the page to be migrated. + +2. Ensure that writeback is complete. + +3. Lock the new page that we want to move to. It is locked so that accesses to + this (not yet up-to-date) page immediately block while the move is in progress. + +4. All the page table references to the page are converted to migration + entries. This decreases the mapcount of a page. 
If the resulting + mapcount is not zero then we do not migrate the page. All user space + processes that attempt to access the page will now wait on the page lock + or wait for the migration page table entry to be removed. + +5. The i_pages lock is taken. This will cause all processes trying + to access the page via the mapping to block on the spinlock. + +6. The refcount of the page is examined and we back out if references remain. + Otherwise, we know that we are the only one referencing this page. + +7. The radix tree is checked and if it does not contain the pointer to this + page then we back out because someone else modified the radix tree. + +8. The new page is prepped with some settings from the old page so that + accesses to the new page will discover a page with the correct settings. + +9. The radix tree is changed to point to the new page. + +10. The reference count of the old page is dropped because the address space + reference is gone. A reference to the new page is established because + the new page is referenced by the address space. + +11. The i_pages lock is dropped. With that lookups in the mapping + become possible again. Processes will move from spinning on the lock + to sleeping on the locked new page. + +12. The page contents are copied to the new page. + +13. The remaining page flags are copied to the new page. + +14. The old page flags are cleared to indicate that the page does + not provide any information anymore. + +15. Queued up writeback on the new page is triggered. + +16. If migration entries were inserted into the page table, then replace them + with real ptes. Doing so will enable access for user space processes not + already waiting for the page lock. + +17. The page locks are dropped from the old and new page. + Processes waiting on the page lock will redo their page faults + and will reach the new page. + +18. The new page is moved to the LRU and can be scanned by the swapper, + etc. again. 
+ +Non-LRU page migration +====================== + +Although migration originally aimed for reducing the latency of memory accesses +for NUMA, compaction also uses migration to create high-order pages. + +Current problem of the implementation is that it is designed to migrate only +*LRU* pages. However, there are potential non-LRU pages which can be migrated +in drivers, for example, zsmalloc, virtio-balloon pages. + +For virtio-balloon pages, some parts of migration code path have been hooked +up and added virtio-balloon specific functions to intercept migration logics. +It's too specific to a driver so other drivers who want to make their pages +movable would have to add their own specific hooks in the migration path. + +To overcome the problem, VM supports non-LRU page migration which provides +generic functions for non-LRU movable pages without driver specific hooks +in the migration path. + +If a driver wants to make its pages movable, it should define three functions +which are function pointers of struct address_space_operations. + +1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` + + What VM expects from isolate_page() function of driver is to return *true* + if driver isolates the page successfully. On returning true, VM marks the page + as PG_isolated so concurrent isolation in several CPUs skip the page + for isolation. If a driver cannot isolate the page, it should return *false*. + + Once page is successfully isolated, VM uses page.lru fields so driver + shouldn't expect to preserve values in those fields. + +2. ``int (*migratepage) (struct address_space *mapping,`` +| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` + + After isolation, VM calls migratepage() of driver with the isolated page. + The function of migratepage() is to move the contents of the old page to the + new page + and set up fields of struct page newpage. 
Keep in mind that you should + indicate to the VM the oldpage is no longer movable via __ClearPageMovable() + under page_lock if you migrated the oldpage successfully and returned + MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver + can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time + because VM interprets -EAGAIN as "temporary migration failure". On returning + any error except -EAGAIN, VM will give up the page migration without + retrying. + + Driver shouldn't touch the page.lru field while in the migratepage() function. + +3. ``void (*putback_page)(struct page *);`` + + If migration fails on the isolated page, VM should return the isolated page + to the driver so VM calls the driver's putback_page() with the isolated page. + In this function, the driver should put the isolated page back into its own data + structure. + +Non-LRU movable page flags + + There are two page flags for supporting non-LRU movable page. + + * PG_movable + + Driver should use the function below to make page movable under page_lock:: + + void __SetPageMovable(struct page *page, struct address_space *mapping) + + It needs argument of address_space for registering migration + family functions which will be called by VM. Exactly speaking, + PG_movable is not a real flag of struct page. Rather, VM + reuses the page->mapping's lower bits to represent it:: + + #define PAGE_MAPPING_MOVABLE 0x2 + page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; + + so driver shouldn't access page->mapping directly. Instead, driver should + use page_mapping() which masks off the low two bits of page->mapping under + page lock so it can get the right struct address_space. + + For testing of non-LRU movable pages, VM supports __PageMovable() function. + However, it doesn't guarantee to identify non-LRU movable pages because + the page->mapping field is unified with other variables in struct page. 
+ If the driver releases the page after isolation by VM, page->mapping + doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set + (look at __ClearPageMovable). But __PageMovable() is cheap to call whether + page is LRU or non-LRU movable once the page has been isolated because LRU + pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also + good for just peeking to test non-LRU movable pages before more expensive + checking with lock_page() in pfn scanning to select a victim. + + For guaranteeing non-LRU movable page, VM provides PageMovable() function. + Unlike __PageMovable(), PageMovable() validates page->mapping and + mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents + sudden destroying of page->mapping. + + Drivers using __SetPageMovable() should clear the flag via + __ClearMovablePage() under page_lock() before the releasing the page. + + * PG_isolated + + To prevent concurrent isolation among several CPUs, VM marks isolated page + as PG_isolated under lock_page(). So if a CPU encounters PG_isolated + non-LRU movable page, it can skip it. Driver doesn't need to manipulate the + flag because VM will set/clear it automatically. Keep in mind that if the + driver sees a PG_isolated page, it means the page has been isolated by the + VM so it shouldn't touch the page.lru field. + The PG_isolated flag is aliased with the PG_reclaim flag so drivers + shouldn't use PG_isolated for its own purposes. + +Monitoring Migration +===================== + +The following events (counters) can be used to monitor page migration. + +1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a + page was migrated. If the page was a non-THP and non-hugetlb page, then + this counter is increased by one. If the page was a THP or hugetlb, then + this counter is increased by the number of THP or hugetlb subpages. 
+ For example, migration of a single 2MB THP that has 4KB-size base pages + (subpages) will cause this counter to increase by 512. + +2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for + PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, + if it was a THP or hugetlb. + +3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. + +4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split. + +5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had + to be split. After splitting, a migration retry was used for it's sub-pages. + +THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or +PGMIGRATE_FAIL events. For example, a THP migration failure will cause both +THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. + +Christoph Lameter, May 8, 2006. +Minchan Kim, Mar 28, 2016. diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst new file mode 100644 index 000000000000..f5c954afe97c --- /dev/null +++ b/Documentation/mm/page_owner.rst @@ -0,0 +1,196 @@ +.. _page_owner: + +================================================== +page owner: Tracking about who allocated each page +================================================== + +Introduction +============ + +page owner is for the tracking about who allocated each page. +It can be used to debug memory leak or to find a memory hogger. +When allocation happens, information about allocation such as call stack +and order of pages is stored into certain storage for each page. +When we need to know about status of all pages, we can get and analyze +this information. + +Although we already have tracepoint for tracing page allocation/free, +using it for analyzing who allocate each page is rather complex. We need +to enlarge the trace buffer for preventing overlapping until userspace +program launched. 
And, launched program continually dump out the trace +buffer for later analysis and it would change system behaviour with more +possibility rather than just keeping it in memory, so bad for debugging. + +page owner can also be used for various purposes. For example, accurate +fragmentation statistics can be obtained through gfp flag information of +each page. It is already implemented and activated if page owner is +enabled. Other usages are more than welcome. + +page owner is disabled by default. So, if you'd like to use it, you need +to add "page_owner=on" to your boot cmdline. If the kernel is built +with page owner and page owner is disabled in runtime due to not enabling +boot option, runtime overhead is marginal. If disabled in runtime, it +doesn't require memory to store owner information, so there is no runtime +memory overhead. And, page owner inserts just two unlikely branches into +the page allocator hotpath and if not enabled, then allocation is done +like as the kernel without page owner. These two unlikely branches should +not affect to allocation performance, especially if the static keys jump +label patching functionality is available. Following is the kernel's code +size change due to this facility. + +- Without page owner:: + + text data bss dec hex filename + 48392 2333 644 51369 c8a9 mm/page_alloc.o + +- With page owner:: + + text data bss dec hex filename + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6662 108 29 6799 1a8f mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o + +Although, roughly, 8 KB code is added in total, page_alloc.o increase by +520 bytes and less than half of it is in hotpath. Building the kernel with +page owner and turning it on if needed would be great option to debug +kernel memory problem. + +There is one notice that is caused by implementation detail. page owner +stores information into the memory from struct page extension. 
This memory +is initialized some time later than that page allocator starts in sparse +memory system, so, until initialization, many pages can be allocated and +they would have no owner information. To fix it up, these early allocated +pages are investigated and marked as allocated in initialization phase. +Although it doesn't mean that they have the right owner information, +at least, we can tell whether the page is allocated or not, +more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages +are catched and marked, although they are mostly allocated from struct +page extension feature. Anyway, after that, no page is left in +un-tracking state. + +Usage +===== + +1) Build user-space helper:: + + cd tools/vm + make page_owner_sort + +2) Enable page owner: add "page_owner=on" to boot cmdline. + +3) Do the job that you want to debug. + +4) Analyze information from page owner:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + + The general output of ``page_owner_full.txt`` is as follows:: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows + in buf, uses regexp to extract the page order value, counts the times + and pages of buf, and finally sorts them according to the parameter(s). + + See the result about who allocated each page + in the ``sorted_page_owner.txt``. General output:: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + By default, ``page_owner_sort`` is sorted according to the times of buf. + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: + + fundamental function:: + + Sort: + -a Sort by memory allocation time. + -m Sort by total memory. + -p Sort by pid. + -P Sort by tgid. + -n Sort by task command name. 
+ -r Sort by memory release time. + -s Sort by stack trace. + -t Sort by times (default). + --sort Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. + Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is + optional since default direction is increasing numerical or lexicographic + order. Mixed use of abbreviated and complete-form of keys is allowed. + + Examples: + ./page_owner_sort --sort=n,+pid,-tgid + ./page_owner_sort --sort=at + + additional function:: + + Cull: + --cull + Specify culling rules.Culling syntax is key[,key[,...]].Choose a + multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. + + is a single argument in the form of a comma-separated list, + which offers a way to specify individual culling rules. The recognized + keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. + can be specified by the sequence of keys k1,k2, ..., as described in + the STANDARD SORT KEYS section below. Mixed use of abbreviated and + complete-form of keys is allowed. + + Examples: + ./page_owner_sort --cull=stacktrace + ./page_owner_sort --cull=st,pid,name + ./page_owner_sort --cull=n,f + + Filter: + -f Filter out the information of blocks whose memory has been released. + + Select: + --pid Select by pid. This selects the blocks whose process ID + numbers appear in . + --tgid Select by tgid. This selects the blocks whose thread + group ID numbers appear in . + --name Select by task command name. This selects the blocks whose + task command name appear in . + + , , are single arguments in the form of a comma-separated list, + which offers a way to specify individual selecting rules. 
+ + + Examples: + ./page_owner_sort --pid=1 + ./page_owner_sort --tgid=1,2,3 + ./page_owner_sort --name name1,name2 + +STANDARD FORMAT SPECIFIERS +========================== +:: + + For --sort option: + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + st stacktrace stack trace of the page allocation + T txt full text of block + ft free_ts timestamp of the page when it was released + at alloc_ts timestamp of the page when it was allocated + ator allocator memory allocator for pages + + For --curl option: + + KEY LONG DESCRIPTION + p pid process ID + tg tgid thread group ID + n name task command name + f free whether the page has been released or not + st stacktrace stack trace of the page allocation + ator allocator memory allocator for pages diff --git a/Documentation/mm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst new file mode 100644 index 000000000000..50a30b7f8ac3 --- /dev/null +++ b/Documentation/mm/page_reclaim.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Page Reclaim +============ diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst new file mode 100644 index 000000000000..1a09472f10a3 --- /dev/null +++ b/Documentation/mm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _page_table_check: + +================ +Page Table Check +================ + +Introduction +============ + +Page table check allows to harden the kernel by ensuring that some types of +the memory corruptions are prevented. + +Page table check performs extra verifications at the time when new pages become +accessible from the userspace by getting their page table entries (PTEs PMDs +etc.) added into the table. + +In case of detected corruption, the kernel is crashed. There is a small +performance and memory overhead associated with the page table check. 
Therefore, +it is disabled by default, but can be optionally enabled on systems where the +extra hardening outweighs the performance costs. Also, because page table check +is synchronous, it can help with debugging double map memory corruption issues, +by crashing kernel at the time wrong mapping occurs instead of later which is +often the case with memory corruptions bugs. + +Double mapping detection logic +============================== + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +Enabling Page Table Check +========================= + +Build kernel with: + +- PAGE_TABLE_CHECK=y + Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK + is available. + +- Boot with 'page_table_check=on' kernel parameter. + +Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page +table support without extra kernel parameter. diff --git a/Documentation/mm/page_tables.rst b/Documentation/mm/page_tables.rst new file mode 100644 index 000000000000..96939571d7bc --- /dev/null +++ b/Documentation/mm/page_tables.rst @@ -0,0 +1,5 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +=========== +Page Tables +=========== diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst new file mode 100644 index 000000000000..2ab7b8c1c863 --- /dev/null +++ b/Documentation/mm/physical_memory.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Physical Memory +=============== diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst new file mode 100644 index 000000000000..e8618fbc62c9 --- /dev/null +++ b/Documentation/mm/process_addrs.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Process Addresses +================= diff --git a/Documentation/mm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst new file mode 100644 index 000000000000..7bef6718e3a9 --- /dev/null +++ b/Documentation/mm/remap_file_pages.rst @@ -0,0 +1,33 @@ +.. _remap_file_pages: + +============================== +remap_file_pages() system call +============================== + +The remap_file_pages() system call is used to create a nonlinear mapping, +that is, a mapping in which the pages of the file are mapped into a +nonsequential order in memory. The advantage of using remap_file_pages() +over using repeated calls to mmap(2) is that the former approach does not +require the kernel to create additional VMA (Virtual Memory Area) data +structures. + +Supporting of nonlinear mapping requires significant amount of non-trivial +code in kernel virtual memory subsystem including hot paths. Also to get +nonlinear mapping work kernel need a way to distinguish normal page table +entries from entries with file offset (pte_file). Kernel reserves flag in +PTE for this purpose. PTE flags are scarce resource especially on some CPU +architectures. It would be nice to free up the flag for other usage. + +Fortunately, there are not many users of remap_file_pages() in the wild. 
+It's only known that one enterprise RDBMS implementation uses the syscall +on 32-bit systems to map files bigger than can linearly fit into 32-bit +virtual address space. This use-case is not critical anymore since 64-bit +systems are widely available. + +The syscall is deprecated and has been replaced with an emulation. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but the ABI is preserved. + +One side effect of emulation (apart from performance) is that a user can hit the +vm.max_map_count limit more easily due to additional VMAs. See comment for +DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/mm/shmfs.rst b/Documentation/mm/shmfs.rst new file mode 100644 index 000000000000..8b01ebb4c30e --- /dev/null +++ b/Documentation/mm/shmfs.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +Shared Memory Filesystem +======================== diff --git a/Documentation/mm/slab.rst b/Documentation/mm/slab.rst new file mode 100644 index 000000000000..87d5a5bb172f --- /dev/null +++ b/Documentation/mm/slab.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Slab Allocation +=============== diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst new file mode 100644 index 000000000000..43063ade737a --- /dev/null +++ b/Documentation/mm/slub.rst @@ -0,0 +1,452 @@ +.. _slub: + +========================== +Short users guide for SLUB +========================== + +The basic philosophy of SLUB is very different from SLAB. SLAB +requires rebuilding the kernel to activate debug options for all +slab caches. SLUB always includes full debugging but it is off by default. +SLUB can enable debugging only for selected slabs in order to avoid +an impact on overall system performance which may make a bug more +difficult to find.
+ +In order to switch debugging on one can add an option ``slub_debug`` +to the kernel command line. That will enable full debugging for +all slabs. + +Typically one would then use the ``slabinfo`` command to get statistical +data and perform operation on the slabs. By default ``slabinfo`` only lists +slabs that have data in them. See "slabinfo -h" for more options when +running the command. ``slabinfo`` can be compiled with +:: + + gcc -o slabinfo tools/vm/slabinfo.c + +Some of the modes of operation of ``slabinfo`` require that slub debugging +be enabled on the command line. F.e. no tracking information will be +available without debugging on and validation can only partially +be performed if debugging was not switched on. + +Some more sophisticated uses of slub_debug: +------------------------------------------- + +Parameters may be given to ``slub_debug``. If none is specified then full +debugging is enabled. Format: + +slub_debug= + Enable options for all slabs + +slub_debug=,,,... + Enable options only for select slabs (no spaces + after a comma) + +Multiple blocks of options for all slabs or selected slabs can be given, with +blocks of options delimited by ';'. The last of "all slabs" blocks is applied +to all slabs except those that match one of the "select slabs" block. Options +of the first "select slabs" blocks that matches the slab's name are applied. + +Possible debug options are:: + + F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS + Sorry SLAB legacy issues) + Z Red zoning + P Poisoning (object and padding) + U User tracking (free and alloc) + T Trace (please only use on single slabs) + A Enable failslab filter mark for the cache + O Switch debugging off for caches that would have + caused higher minimum slab orders + - Switch all debugging off (useful if the kernel is + configured with CONFIG_SLUB_DEBUG_ON) + +F.e. 
in order to boot just with sanity checks and red zoning one would specify:: + + slub_debug=FZ + +Trying to find an issue in the dentry cache? Try:: + + slub_debug=,dentry + +to only enable debugging on the dentry cache. You may use an asterisk at the +end of the slab name, in order to cover all slabs with the same prefix. For +example, here's how you can poison the dentry cache as well as all kmalloc +slabs:: + + slub_debug=P,kmalloc-*,dentry + +Red zoning and tracking may realign the slab. We can just apply sanity checks +to the dentry cache with:: + + slub_debug=F,dentry + +Debugging options may require the minimum possible slab order to increase as +a result of storing the metadata (for example, caches with PAGE_SIZE object +sizes). This has a higher likelihood of resulting in slab allocation errors +in low memory situations or if there's high fragmentation of memory. To +switch off debugging for such caches by default, use:: + + slub_debug=O + +You can apply different options to different lists of slab names, using blocks +of options. This will enable red zoning for dentry and user tracking for +kmalloc. All other slabs will not get any debugging enabled:: + + slub_debug=Z,dentry;U,kmalloc-* + +You can also enable options (e.g. sanity checks and red zoning) for all caches +except some that are deemed too performance critical and don't need to be +debugged by specifying global debug options followed by a list of slab names +with "-" as options:: + + slub_debug=FZ;-,zs_handle,zspage + +The state of each debug option for a slab can be found in the respective files +under:: + + /sys/kernel/slab// + +If the file contains 1, the option is enabled, 0 means disabled. The debug +options from the ``slub_debug`` parameter translate to the following files:: + + F sanity_checks + Z red_zone + P poison + U store_user + T trace + A failslab + +Careful with tracing: It may spew out lots of information and never stop if +used on the wrong slab.
+ +Slab merging +============ + +If no debug options are specified then SLUB may merge similar slabs together +in order to reduce overhead and increase cache hotness of objects. +``slabinfo -a`` displays which slabs were merged together. + +Slab validation +=============== + +SLUB can validate all object if the kernel was booted with slub_debug. In +order to do so you must have the ``slabinfo`` tool. Then you can do +:: + + slabinfo -v + +which will test all objects. Output will be generated to the syslog. + +This also works in a more limited way if boot was without slab debug. +In that case ``slabinfo -v`` simply tests all reachable objects. Usually +these are in the cpu slabs and the partial slabs. Full slabs are not +tracked by SLUB in a non debug situation. + +Getting more performance +======================== + +To some degree SLUB's performance is limited by the need to take the +list_lock once in a while to deal with partial slabs. That overhead is +governed by the order of the allocation for each slab. The allocations +can be influenced by kernel parameters: + +.. slub_min_objects=x (default 4) +.. slub_min_order=x (default 0) +.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) + +``slub_min_objects`` + allows to specify how many objects must at least fit into one + slab in order for the allocation order to be acceptable. In + general slub will be able to perform this number of + allocations on a slab without consulting centralized resources + (list_lock) where contention may occur. + +``slub_min_order`` + specifies a minimum order of slabs. A similar effect like + ``slub_min_objects``. + +``slub_max_order`` + specified the order at which ``slub_min_objects`` should no + longer be checked. This is useful to avoid SLUB trying to + generate super large order pages to fit ``slub_min_objects`` + of a slab cache with large object sizes into one high order + page. 
Setting command line parameter + ``debug_guardpage_minorder=N`` (N > 0), forces setting + ``slub_max_order`` to 0, what cause minimum possible order of + slabs allocation. + +SLUB Debug output +================= + +Here is a sample of slub debug output:: + + ==================================================================== + BUG kmalloc-8: Right Redzone overwritten + -------------------------------------------------------------------- + + INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc + INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 + INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 + INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 + + Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ + Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 + Redzone (0xc90f6d28): 00 cc cc cc . + Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ + + [] dump_trace+0x63/0x1eb + [] show_trace_log_lvl+0x1a/0x2f + [] show_trace+0x12/0x14 + [] dump_stack+0x16/0x18 + [] object_err+0x143/0x14b + [] check_object+0x66/0x234 + [] __slab_free+0x239/0x384 + [] kfree+0xa6/0xc6 + [] get_modalias+0xb9/0xf5 + [] dmi_dev_uevent+0x27/0x3c + [] dev_uevent+0x1ad/0x1da + [] kobject_uevent_env+0x20a/0x45b + [] kobject_uevent+0xa/0xf + [] store_uevent+0x4f/0x58 + [] dev_attr_store+0x29/0x2f + [] sysfs_write_file+0x16e/0x19c + [] vfs_write+0xd1/0x15a + [] sys_write+0x3d/0x72 + [] sysenter_past_esp+0x5f/0x99 + [] 0xb7f7b410 + ======================= + + FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc + +If SLUB encounters a corrupted object (full detection requires the kernel +to be booted with slub_debug) then the following output will be dumped +into the syslog: + +1. Description of the problem encountered + + This will be a message in the system log starting with:: + + =============================================== + BUG : + ----------------------------------------------- + + INFO: - + INFO: Slab
+ INFO: Object
+ INFO: Allocated in age= cpu= pid= + INFO: Freed in age= cpu= + pid= + + (Object allocation / free information is only available if SLAB_STORE_USER is + set for the slab. slub_debug sets that option) + +2. The object contents if an object was involved. + + Various types of lines can follow the BUG SLUB line: + + Bytes b4
: + Shows a few bytes before the object where the problem was detected. + Can be useful if the corruption does not stop with the start of the + object. + + Object
: + The bytes of the object. If the object is inactive then the bytes + typically contain poison values. Any non-poison value shows a + corruption by a write after free. + + Redzone
: + The Redzone following the object. The Redzone is used to detect + writes after the object. All bytes should always have the same + value. If there is any deviation then it is due to a write after + the object boundary. + + (Redzone information is only available if SLAB_RED_ZONE is set. + slub_debug sets that option) + + Padding
: + Unused data to fill up the space in order to get the next object + properly aligned. In the debug case we make sure that there are + at least 4 bytes of padding. This allows the detection of writes + before the object. + +3. A stackdump + + The stackdump describes the location where the error was detected. The cause + of the corruption is more likely to be found by looking at the function that + allocated or freed the object. + +4. Report on how the problem was dealt with in order to ensure the continued + operation of the system. + + These are messages in the system log beginning with:: + + FIX : + + In the above sample SLUB found that the Redzone of an active object has + been overwritten. Here a string of 8 characters was written into a slab that + has the length of 8 characters. However, an 8 character string needs a + terminating 0. That zero has overwritten the first byte of the Redzone field. + After reporting the details of the issue encountered the FIX SLUB message + tells us that SLUB has restored the Redzone to its proper value and then + system operations continue. + +Emergency operations +==================== + +Minimal debugging (sanity checks alone) can be enabled by booting with:: + + slub_debug=F + +This will generally be enough to enable the resiliency features of slub +which will keep the system running even if a bad kernel component will +keep corrupting objects. This may be important for production systems. +Performance will be impacted by the sanity checks and there will be a +continual stream of error messages to the syslog but no additional memory +will be used (unlike full debugging). + +No guarantees. The kernel component still needs to be fixed.
Performance +may be optimized further by locating the slab that experiences corruption +and enabling debugging only for that cache + +I.e.:: + + slub_debug=F,dentry + +If the corruption occurs by writing after the end of the object then it +may be advisable to enable a Redzone to avoid corrupting the beginning +of other objects:: + + slub_debug=FZ,dentry + +Extended slabinfo mode and plotting +=================================== + +The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: + - Slabcache Totals + - Slabs sorted by size (up to -N slabs, default 1) + - Slabs sorted by loss (up to -N slabs, default 1) + +Additionally, in this mode ``slabinfo`` does not dynamically scale +sizes (G/M/K) and reports everything in bytes (this functionality is +also available to other slabinfo modes via '-B' option) which makes +reporting more precise and accurate. Moreover, in some sense the `-X' +mode also simplifies the analysis of slabs' behaviour, because its +output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it +pushes the analysis from looking through the numbers (tons of numbers) +to something easier -- visual analysis. + +To generate plots: + +a) collect slabinfo extended records, for example:: + + while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done + +b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: + + slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN] + + The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records + and generates 3 png files (and 3 pre-processing cache files) per STATS + file: + - Slabcache Totals: FOO_STATS-totals.png + - Slabs sorted by size: FOO_STATS-slabs-by-size.png + - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png + +Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you +need to compare slabs' behaviour "prior to" and "after" some code +modification. 
To help you out there, ``slabinfo-gnuplot.sh`` script +can 'merge' the `Slabcache Totals` sections from different +measurements. To visually compare N plots: + +a) Collect as many STATS1, STATS2, .. STATSN files as you need:: + + while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done + +b) Pre-process those STATS files:: + + slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN + +c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the + generated pre-processed \*-totals:: + + slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals + + This will produce a single plot (png file). + + Plots, expectedly, can be large so some fluctuations or small spikes + can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two + options to 'zoom-in'/'zoom-out': + + a) ``-s %d,%d`` -- overwrites the default image width and height + b) ``-r %d,%d`` -- specifies a range of samples to use (for example, + in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r + 40,60`` range will plot only samples collected between 40th and + 60th seconds). + + +DebugFS files for SLUB +====================== + +For more information about current state of SLUB caches with the user tracking +debug option enabled, debugfs files are available, typically under +/sys/kernel/debug/slab// (created only for caches with enabled user +tracking). There are 2 types of these files with the following debug +information: + +1. alloc_traces:: + + Prints information about unique allocation traces of the currently + allocated objects. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, allocating function, minimal/average/maximal jiffies since alloc, + pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. 
+ + Example::: + + 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: + __slab_alloc+0x6d/0x90 + kmem_cache_alloc_trace+0x2eb/0x300 + populate_error_injection_list+0x97/0x110 + init_error_injection+0x1b/0x71 + do_one_initcall+0x5f/0x2d0 + kernel_init_freeable+0x26f/0x2d7 + kernel_init+0xe/0x118 + ret_from_fork+0x22/0x30 + + +2. free_traces:: + + Prints information about unique freeing traces of the currently allocated + objects. The freeing traces thus come from the previous life-cycle of the + objects and are reported as not available for objects allocated for the first + time. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, freeing function, minimal/average/maximal jiffies since free, + pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. + + Example::: + + 1980 age=4294912290 pid=0 cpus=0 + 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 + kfree+0x2db/0x420 + acpi_ut_update_ref_count+0x6a6/0x782 + acpi_ut_update_object_reference+0x1ad/0x234 + acpi_ut_remove_reference+0x7d/0x84 + acpi_rs_get_prt_method_data+0x97/0xd6 + acpi_get_irq_routing_table+0x82/0xc4 + acpi_pci_irq_find_prt_entry+0x8e/0x2e0 + acpi_pci_irq_lookup+0x3a/0x1e0 + acpi_pci_irq_enable+0x77/0x240 + pcibios_enable_device+0x39/0x40 + do_pci_enable_device.part.0+0x5d/0xe0 + pci_enable_device_flags+0xfc/0x120 + pci_enable_device+0x13/0x20 + virtio_pci_probe+0x9e/0x170 + local_pci_probe+0x48/0x80 + pci_device_probe+0x105/0x1c0 + +Christoph Lameter, May 30, 2007 +Sergey Senozhatsky, October 23, 2015 diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst new file mode 100644 index 000000000000..c08919662704 --- /dev/null +++ b/Documentation/mm/split_page_table_lock.rst @@ -0,0 +1,100 @@ +.. 
_split_page_table_lock: + +===================== +Split page table lock +===================== + +Originally, mm->page_table_lock spinlock protected all page tables of the +mm_struct. But this approach leads to poor page fault scalability of +multi-threaded applications due to high contention on the lock. To improve +scalability, split page table lock was introduced. + +With split page table lock we have separate per-table lock to serialize +access to the table. At the moment we use split lock for PTE and PMD +tables. Access to higher level tables is protected by mm->page_table_lock. + +There are helpers to lock/unlock a table and other accessor functions: + + - pte_offset_map_lock() + maps pte and takes PTE table lock, returns pointer to the taken + lock; + - pte_unmap_unlock() + unlocks and unmaps PTE table; + - pte_alloc_map_lock() + allocates PTE table if needed and takes the lock, returns pointer + to taken lock or NULL if allocation failed; + - pte_lockptr() + returns pointer to PTE table lock; + - pmd_lock() + takes PMD table lock, returns pointer to taken lock; + - pmd_lockptr() + returns pointer to PMD table lock; + +Split page table lock for PTE tables is enabled at compile time if +CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS. +If split lock is disabled, all tables are guarded by mm->page_table_lock. + +Split page table lock for PMD tables is enabled, if it's enabled for PTE +tables and the architecture supports it (see below). + +Hugetlb and split page table lock +================================= + +Hugetlb can support several page sizes. We use split lock only for PMD +level, but not for PUD.
+ +Hugetlb-specific helpers: + + - huge_pte_lock() + takes pmd split lock for PMD_SIZE page, mm->page_table_lock + otherwise; + - huge_pte_lockptr() + returns pointer to table lock; + +Support of split page table lock by an architecture +=================================================== + +There's no need for special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. + +Make sure the architecture doesn't use slab allocator for page table +allocation: slab uses page->slab_cache for its pages. +This field shares storage with page->ptl. + +PMD split lock only makes sense if you have more than two page table +levels. + +PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table +allocation and pgtable_pmd_page_dtor() on freeing. + +Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and +pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing +paths: e.g. X86_PAE preallocates a few PMDs on pgd_alloc(). + +With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. + +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +be handled properly. + +page->ptl +========= + +page->ptl is used to access split page table lock, where 'page' is struct +page of page containing the table. It shares storage with page->private +(and a few other fields in the union). + +To avoid increasing size of struct page and have best performance, we use a +trick: + + - if spinlock_t fits into long, we use page->ptl as spinlock, so we + can avoid indirect access and save a cache line. + - if size of spinlock_t is bigger than the size of long, we use page->ptl as + pointer to spinlock_t and allocate it dynamically.
This allows to use + split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs + one more cache line for indirect access; + +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in +pgtable_pmd_page_ctor() for PMD table. + +Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/mm/swap.rst b/Documentation/mm/swap.rst new file mode 100644 index 000000000000..78819bd4d745 --- /dev/null +++ b/Documentation/mm/swap.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +Swap +==== diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst new file mode 100644 index 000000000000..216db1d67d04 --- /dev/null +++ b/Documentation/mm/transhuge.rst @@ -0,0 +1,187 @@ +.. _transhuge: + +============================ +Transparent Hugepage Support +============================ + +This document describes design principles for Transparent Hugepage (THP) +support and its interaction with other parts of the memory management +system. + +Design principles +================= + +- "graceful fallback": mm components which don't have transparent hugepage + knowledge fall back to breaking huge pmd mapping into table of ptes and, + if necessary, split a transparent hugepage. Therefore these components + can continue working on the regular pages or regular pte mappings. 
+ +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +get_user_pages and follow_page +============================== + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). Most GUP users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead. Taking a reference on any head/tail page would +prevent the page from being split by anyone. + +.. note:: + these aren't new constraints to the GUP API, and they match the + same constraints that apply to hugetlbfs too, so any driver capable + of handling GUP on hugetlbfs will also work fine on transparent + hugepage backed mappings. + +Graceful fallback +================= + +Code walking pagetables but unaware about huge pmds can simply call +split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by +pmd_offset. 
It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundreds if not thousands of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +that you can't handle natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. split_huge_page() can fail +if the page is pinned and you must handle this correctly. + +Example to make mremap.c transparent hugepage aware with a one liner +change:: + + diff --git a/mm/mremap.c b/mm/mremap.c + --- a/mm/mremap.c + +++ b/mm/mremap.c + @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); + + split_huge_pmd(vma, pmd, addr); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +Locking in hugepage aware code +============================== + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_lock in read (or write) mode to be sure a huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_lock in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the +page table lock will prevent the huge pmd being converted into a +regular pmd from under you (split_huge_pmd can run in parallel to the +pagetable walk). 
If the second pmd_trans_huge returns false, you +should just drop the page table lock and fallback to the old code as +before. Otherwise, you can proceed to process the huge pmd and the +hugepage natively. Once finished, you can drop the page table lock. + +Refcounts and transparent huge pages +==================================== + +Refcounting on THP is mostly consistent with refcounting on other compound +pages: + + - get_page()/put_page() and GUP operate on head page's ->_refcount. + + - ->_refcount in tail pages is always zero: get_page_unless_zero() never + succeeds on tail pages. + + - map/unmap of the pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page. + + - map/unmap of the whole compound page is accounted for in compound_mapcount + (stored in first tail page). For file huge pages, we also increment + ->_mapcount of all sub-pages in order to have race-free detection of + last unmap of subpages. + +PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. + +For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all +subpages is offset up by one. This additional reference is required to +get race-free detection of unmap of subpages when we have them mapped with +both PMDs and PTEs. + +This optimization is required to lower the overhead of per-subpage mapcount +tracking. The alternative is to alter ->_mapcount in all subpages on each +map/unmap of the whole compound page. + +For anonymous pages, we set PG_double_map when a PMD of the page is split +for the first time, but still have a PMD mapping. The additional references +go away with the last compound_mapcount. + +File pages get PG_double_map set on the first map of the page with PTE and +goes away when the page gets evicted from the page cache. + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the page +structures. 
It can be done easily for refcounts taken by page table +entries, but we don't have enough information on how to distribute any +additional pins (i.e. from get_user_pages). split_huge_page() fails any +requests to split pinned huge pages: it expects page count to be equal to +the sum of mapcount of all sub-pages plus one (split_huge_page caller must +have a reference to the head page). + +split_huge_page uses migration entries to stabilize page->_refcount and +page->_mapcount of anonymous pages. File pages just get unmapped. + +We are safe against physical memory scanners too: the only legitimate way +a scanner can get a reference to a page is get_page_unless_zero(). + +All tail pages have zero ->_refcount until atomic_add(). This prevents the +scanner from getting a reference to the tail page up to that point. After the +atomic_add() we don't care about the ->_refcount value. We already know how +many references should be uncharged from the head page. + +For head page get_page_unless_zero() will succeed and we don't mind. It's +clear where references should go after split: it will stay on the head page. + +Note that split_huge_pmd() doesn't have any limitations on refcounting: +pmd can be split at any point and never fails. + +Partial unmap and deferred_split_huge_page() +============================================ + +Unmapping part of THP (with munmap() or other way) is not going to free +memory immediately. Instead, we detect that a subpage of THP is not in use +in page_remove_rmap() and queue the THP for splitting if memory pressure +comes. Splitting will free up unused subpages. + +Splitting the page right away is not an option due to locking context in +the place where we can detect partial unmap. It also might be +counterproductive since in many cases partial unmap happens during exit(2) if +a THP crosses a VMA boundary. + +The function deferred_split_huge_page() is used to queue a page for splitting. 
+The splitting itself will happen when we get memory pressure via shrinker +interface. diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst new file mode 100644 index 000000000000..b280367d6a44 --- /dev/null +++ b/Documentation/mm/unevictable-lru.rst @@ -0,0 +1,554 @@ +.. _unevictable_lru: + +============================== +Unevictable LRU Infrastructure +============================== + +.. contents:: :local: + + +Introduction +============ + +This document describes the Linux memory manager's "Unevictable LRU" +infrastructure and the use of this to manage several types of "unevictable" +pages. + +The document attempts to provide the overall rationale behind this mechanism +and the rationale for some of the design decisions that drove the +implementation. The latter design rationale is discussed in the context of an +implementation description. Admittedly, one can obtain the implementation +details - the "what does it do?" - by reading the code. One hopes that the +descriptions below add value by providing the answer to "why does it do that?". + + + +The Unevictable LRU +=================== + +The Unevictable LRU facility adds an additional LRU list to track unevictable +pages and to hide these pages from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with page +reclaim in Linux. The problems have been observed at customer sites on large +memory x86_64 systems. + +To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of +main memory will have over 32 million 4k pages in a single node. When a large +fraction of these pages are not evictable for any reason [see below], vmscan +will spend a lot of time scanning the LRU lists looking for the small fraction +of pages that are evictable. This can result in a situation where all CPUs are +spending 100% of their time in vmscan for hours or days on end, with the system +completely unresponsive.
+ +The unevictable list addresses the following classes of unevictable pages: + + * Those owned by ramfs. + + * Those mapped into SHM_LOCK'd shared memory regions. + + * Those mapped into VM_LOCKED [mlock()ed] VMAs. + +The infrastructure may also be able to handle other conditions that make pages +unevictable, either by definition or by circumstance, in the future. + + +The Unevictable LRU Page List +----------------------------- + +The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a +companion to the LRU-ordered anonymous and file, active and inactive page lists; +and now it is not even a page list. But following familiar convention, here in +this document and in the source, we often imagine it as a fifth LRU page list. + +The Unevictable LRU infrastructure consists of an additional, per-node, LRU list +called the "unevictable" list and an associated page flag, PG_unevictable, to +indicate that the page is being managed on the unevictable list. + +The PG_unevictable flag is analogous to, and mutually exclusive with, the +PG_active flag in that it indicates on which LRU list a page resides when +PG_lru is set. + +The Unevictable LRU infrastructure maintains unevictable pages as if they were +on an additional LRU list for a few reasons: + + (1) We get to "treat unevictable pages just like we treat other pages in the + system - which means we get to use the same code to manipulate them, the + same code to isolate them (for migrate, etc.), the same code to keep track + of the statistics, etc..." [Rik van Riel] + + (2) We want to be able to migrate unevictable pages between nodes for memory + defragmentation, workload management and memory hotplug. The Linux kernel + can only migrate pages that it can successfully isolate from the LRU + lists (or "Movable" pages: outside of consideration here). 
If we were to + maintain pages elsewhere than on an LRU-like list, where they can be + detected by isolate_lru_page(), we would prevent their migration. + +The unevictable list does not differentiate between file-backed and anonymous, +swap-backed pages. This differentiation is only important while the pages are, +in fact, evictable. + +The unevictable list benefits from the "arrayification" of the per-node LRU +lists and statistics originally proposed and posted by Christoph Lameter. + + +Memory Control Group Interaction +-------------------------------- + +The unevictable LRU facility interacts with the memory control group [aka +memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by +extending the lru_list enum. + +The memory controller data structure automatically gets a per-node unevictable +list as a result of the "arrayification" of the per-node LRU lists (one per +lru_list enum element). The memory controller tracks the movement of pages to +and from the unevictable list. + +When a memory control group comes under memory pressure, the controller will +not attempt to reclaim pages on the unevictable list. This has a couple of +effects: + + (1) Because the pages are "hidden" from reclaim on the unevictable list, the + reclaim process can be more efficient, dealing only with pages that have a + chance of being reclaimed. + + (2) On the other hand, if too many of the pages charged to the control group + are unevictable, the evictable portion of the working set of the tasks in + the control group may not fit into the available memory. This can cause + the control group to thrash or to OOM-kill tasks. + + +.. _mark_addr_space_unevict: + +Marking Address Spaces Unevictable +---------------------------------- + +For facilities such as ramfs none of the pages attached to the address space +may be evicted. 
To prevent eviction of any such pages, the AS_UNEVICTABLE +address space flag is provided, and this can be manipulated by a filesystem +using a number of wrapper functions: + + * ``void mapping_set_unevictable(struct address_space *mapping);`` + + Mark the address space as being completely unevictable. + + * ``void mapping_clear_unevictable(struct address_space *mapping);`` + + Mark the address space as being evictable. + + * ``int mapping_unevictable(struct address_space *mapping);`` + + Query the address space, and return true if it is completely + unevictable. + +These are currently used in three places in the kernel: + + (1) By ramfs to mark the address spaces of its inodes when they are created, + and this mark remains for the life of the inode. + + (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. + Note that SHM_LOCK is not required to page in the locked pages if they're + swapped out; the application must touch the pages manually if it wants to + ensure they're in memory. + + (3) By the i915 driver to mark pinned address space until it's unpinned. The + amount of unevictable memory marked by i915 driver is roughly the bounded + object size in debugfs/dri/0/i915_gem_objects. + + +Detecting Unevictable Pages +--------------------------- + +The function page_evictable() in mm/internal.h determines whether a page is +evictable or not using the query function outlined above [see section +:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`] +to check the AS_UNEVICTABLE flag. + +For address spaces that are so marked after being populated (as SHM regions +might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate +the page tables for the region as does, for example, mlock(), nor need it make +any special effort to push any pages in the SHM_LOCK'd area to the unevictable +list. Instead, vmscan will do this if and when it encounters the pages during +a reclamation scan.
+ +On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan +the pages in the region and "rescue" them from the unevictable list if no other +condition is keeping them unevictable. If an unevictable region is destroyed, +the pages are also "rescued" from the unevictable list in the process of +freeing them. + +page_evictable() also checks for mlocked pages by testing an additional page +flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is +faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. + + +Vmscan's Handling of Unevictable Pages +-------------------------------------- + +If unevictable pages are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the pages until they +have become evictable again (via munlock() for example) and have been "rescued" +from the unevictable list. However, there may be situations where we decide, +for the sake of expediency, to leave an unevictable page on one of the regular +active/inactive LRU lists for vmscan to deal with. vmscan checks for such +pages in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such pages that it encounters: that is, it diverts those pages to the +unevictable list for the memory cgroup and node being scanned. + +There may be situations where a page is mapped into a VM_LOCKED VMA, but the +page is not marked as PG_mlocked. Such pages will make it all the way to +shrink_active_list() or shrink_page_list() where they will be detected when +vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page +is culled to the unevictable list when it is released by the shrinker. + +To "cull" an unevictable page, vmscan simply puts the page back on the LRU list +using putback_lru_page() - the inverse operation to isolate_lru_page() - after +dropping the page lock. 
Because the condition which makes the page unevictable +may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the +unevictable state of a page before placing it on the unevictable list. + + +MLOCKED Pages +============= + +The unevictable page list is also useful for mlock(), in addition to ramfs and +SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in +NOMMU situations, all mappings are effectively mlocked. + + +History +------- + +The "Unevictable mlocked Pages" infrastructure is based on work originally +posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". +Nick posted his patch as an alternative to a patch posted by Christoph Lameter +to achieve the same objective: hiding mlocked pages from vmscan. + +In Nick's patch, he used one of the struct page LRU list link fields as a count +of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years +earlier). But this use of the link field for a count prevented the management +of the pages on an LRU list, and thus mlocked pages were not migratable as +isolate_lru_page() could not detect them, and the LRU list link field was not +available to the migration subsystem. + +Nick resolved this by putting mlocked pages back on the LRU list before +attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When +Nick's patch was integrated with the Unevictable LRU work, the count was +replaced by walking the reverse map when munlocking, to determine whether any +other VM_LOCKED VMAs still mapped the page. + +However, walking the reverse map for each page when munlocking was ugly and +inefficient, and could lead to catastrophic contention on a file's rmap lock, +when many processes which had it mlocked were trying to exit. In 5.18, the +idea of keeping mlock_count in Unevictable LRU list link field was revived and +put to work, without preventing the migration of mlocked pages. 
This is why +the "Unevictable LRU list" cannot be a linked list of pages now; but there was +no use for that linked list anyway - though its size is maintained for meminfo. + + +Basic Management +---------------- + +mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable +pages. When such a page has been "noticed" by the memory management subsystem, +the page is marked with the PG_mlocked flag. This can be manipulated using the +PageMlocked() functions. + +A PG_mlocked page will be placed on the unevictable list when it is added to +the LRU. Such pages can be "noticed" by memory management in several places: + + (1) in the mlock()/mlock2()/mlockall() system call handlers; + + (2) in the mmap() system call handler when mmapping a region with the + MAP_LOCKED flag; + + (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE + flag; + + (4) in the fault path and when a VM_LOCKED stack segment is expanded; or + + (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap(). + +mlocked pages become unlocked and rescued from the unevictable list when: + + (1) mapped in a range unlocked via the munlock()/munlockall() system calls; + + (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including + unmapping at task exit; + + (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; + or + + (4) before a page is COW'd in a VM_LOCKED VMA. + + +mlock()/mlock2()/mlockall() System Call Handling +------------------------------------------------ + +mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup() +for each VMA in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlocking and munlocking a range of memory. 
A call to mlock() +an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is +treated as a no-op and mlock_fixup() simply returns. + +If the VMA passes some filtering as described in "Filtering Special VMAs" +below, mlock_fixup() will attempt to merge the VMA with its neighbors or split +off a subset of the VMA if the range does not cover the entire VMA. Any pages +already present in the VMA are then marked as mlocked by mlock_page() via +mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). + +Before returning from the system call, do_mlock() or mlockall() will call +__mm_populate() to fault in the remaining pages via get_user_pages() and to +mark those pages as mlocked as they are faulted. + +Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's okay. If pages +do end up getting faulted into this VM_LOCKED VMA, they will be handled in the +fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. + +For each PTE (or PMD) being faulted into a VMA, the page add rmap function +calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +(unless it is a PTE mapping of a part of a transparent huge page). Or when +it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() +calls mlock_new_page() instead: similar to mlock_page(), but can make better +judgments, since this page is held exclusively and known not to be on LRU yet. + +mlock_page() sets PageMlocked immediately, then places the page on the CPU's +mlock pagevec, to batch up the rest of the work to be done under lru_lock by +__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +and moves the page to unevictable state ("the unevictable LRU", but with +mlock_count in place of LRU threading). Or if the page was already PageLRU +and PageUnevictable and PageMlocked, it simply increments the mlock_count. 
+ +But in practice that may not work ideally: the page may not yet be on an LRU, or +it may have been temporarily isolated from LRU. In such cases the mlock_count +field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: +rather than risk stranding a page indefinitely as unevictable, always err with +mlock_count on the low side, so that when munlocked the page will be rescued to +an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a +VM_LOCKED VMA. + + +Filtering Special VMAs +---------------------- + +mlock_fixup() filters several classes of "special" VMAs: + +1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to so + mark the page. Because of this, get_user_pages() will fail for these VMAs, + so there is no sense in attempting to visit them. + +2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We + neither need nor want to mlock() these pages. But __mm_populate() includes + hugetlbfs ranges, allocating the huge pages and populating the PTEs. + +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages are inherently + unevictable and are not managed on the LRU lists. __mm_populate() includes + these ranges, populating the PTEs if not already populated. + +4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate() + includes these ranges, populating the PTEs if not already populated. + +Note that for all of these special VMAs, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock(), munmap() or task exit. Neither does mlock_fixup() account these +VMAs against the task's "locked_vm". 
+ + +munlock()/munlockall() System Call Handling +------------------------------------------- + +The munlock() and munlockall() system calls are handled by the same +mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are. +If called to munlock an already munlocked VMA, mlock_fixup() simply returns. +Because of the VMA filtering discussed above, VM_LOCKED will not be set in +any "special" VMAs. So, those VMAs will be ignored for munlock. + +If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the +specified range. All pages in the VMA are then munlocked by munlock_page() via +mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same +function used when mlocking a VMA range, with new flags for the VMA indicating +that it is munlock() being performed. + +munlock_page() uses the mlock pagevec to batch up work to be done under +lru_lock by __munlock_page(). __munlock_page() decrements the page's +mlock_count, and when that reaches 0 it clears PageMlocked and clears +PageUnevictable, moving the page from unevictable state to inactive LRU. + +But in practice that may not work ideally: the page may not yet have reached +"the unevictable LRU", or it may have been temporarily isolated from it. In +those cases its mlock_count field is unusable and must be assumed to be 0: so +that the page will be rescued to an evictable LRU, then perhaps be mlocked +again later if vmscan finds it in a VM_LOCKED VMA. + + +Migrating MLOCKED Pages +----------------------- + +A page that is being migrated has been isolated from the LRU lists and is held +locked across unmapping of the page, updating the page's address space entry +and copying the contents and state, until the page table entry has been +replaced with an entry that refers to the new page. Linux supports migration +of mlocked pages and other unevictable pages. 
PG_mlocked is cleared from the +old page when it is unmapped from the last VM_LOCKED VMA, and set when the +new page is mapped in place of migration entry in a VM_LOCKED VMA. If the page +was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the +page was unevictable for other reasons, PG_unevictable is copied explicitly. + +Note that page migration can race with mlocking or munlocking of the same page. +There is mostly no problem since page migration requires unmapping all PTEs of +the old page (including munlock where VM_LOCKED), then mapping in the new page +(including mlock where VM_LOCKED). The page table locks provide sufficient +synchronization. + +However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, +before mlocking any pages already present, if one of those pages were migrated +before mlock_pte_range() reached it, it would get counted twice in mlock_count. +To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, +so that mlock_vma_page() will skip it. + +To complete page migration, we place the old and new pages back onto the LRU +afterwards. The "unneeded" page - old page on success, new page on failure - +is freed when the reference count held by the migration process is released. + + +Compacting MLOCKED Pages +------------------------ + +The memory map can be scanned for compactable regions and the default behavior +is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed +controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work +of compaction is mostly handled by the page migration code and the same work +flow as described in Migrating MLOCKED Pages will apply. + + +MLOCKING Transparent Huge Pages +------------------------------- + +A transparent huge page is represented by a single entry on an LRU list. +Therefore, we can only make unevictable an entire compound page, not +individual subpages.
+ +If a user tries to mlock() part of a huge page, and no user mlock()s the +whole of the huge page, we want the rest of the page to be reclaimable. + +We cannot just split the page on partial mlock() as split_huge_page() can +fail and a new intermittent failure mode for the syscall is undesirable. + +We handle this by keeping PTE-mlocked huge pages on evictable LRU lists: +the PMD on the border of a VM_LOCKED VMA will be split into a PTE table. + +This way the huge page is accessible for vmscan. Under memory pressure the +page will be split, subpages which belong to VM_LOCKED VMAs will be moved +to the unevictable LRU and the rest can be reclaimed. + +/proc/meminfo's Unevictable and Mlocked amounts do not include those parts +of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs. + + +mmap(MAP_LOCKED) System Call Handling +------------------------------------- + +In addition to the mlock(), mlock2() and mlockall() system calls, an application +can request that a region of memory be mlocked by supplying the MAP_LOCKED flag +to the mmap() call. There is one important and subtle difference here, though. +mmap() + mlock() will fail if the range cannot be faulted in (e.g. because +mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. +The mmaped area will still have properties of the locked area - pages will not +get swapped out - but major page faults to fault memory in might still happen. + +Furthermore, any mmap() call or brk() call that expands the heap by a task +that has previously called mlockall() with the MCL_FUTURE flag will result +in the newly mapped memory being mlocked. Before the unevictable/mlock +changes, the kernel simply called make_pages_present() to allocate pages +and populate the page table. 
+ +To mlock a range of memory under the unevictable/mlock infrastructure, +the mmap() handler and task address space expansion functions call +populate_vma_page_range() specifying the vma and the address range to mlock. + + +munmap()/exit()/exec() System Call Handling +------------------------------------------- + +When unmapping an mlocked region of memory, whether by an explicit call to +munmap() or via an internal unmap from exit() or exec() processing, we must +munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. +Before the unevictable/mlock changes, mlocking did not mark the pages in any +way, so unmapping them required no processing. + +For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls +munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +(unless it was a PTE mapping of a part of a transparent huge page). + +munlock_page() uses the mlock pagevec to batch up work to be done under +lru_lock by __munlock_page(). __munlock_page() decrements the page's +mlock_count, and when that reaches 0 it clears PageMlocked and clears +PageUnevictable, moving the page from unevictable state to inactive LRU. + +But in practice that may not work ideally: the page may not yet have reached +"the unevictable LRU", or it may have been temporarily isolated from it. In +those cases its mlock_count field is unusable and must be assumed to be 0: so +that the page will be rescued to an evictable LRU, then perhaps be mlocked +again later if vmscan finds it in a VM_LOCKED VMA. + + +Truncating MLOCKED Pages +------------------------ + +File truncation or hole punching forcibly unmaps the deleted pages from +userspace; truncation even unmaps and deletes any private anonymous pages +which had been Copied-On-Write from the file pages now being truncated. 
+ +Mlocked pages can be munlocked and deleted in this way: like with munmap(), +for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls +munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +(unless it was a PTE mapping of a part of a transparent huge page). + +However, if there is a racing munlock(), since mlock_vma_pages_range() starts +munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages +present, if one of those pages were unmapped by truncation or hole punch before +mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, +and would not be counted out of mlock_count. In this rare case, a page may +still appear as PageMlocked after it has been fully unmapped: and it is left to +release_pages() (or __page_cache_release()) to clear it and update statistics +before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, +which is usually 0). + + +Page Reclaim in shrink_*_list() +------------------------------- + +vmscan's shrink_active_list() culls any obviously unevictable pages - +i.e. !page_evictable(page) pages - diverting those to the unevictable list. +However, shrink_active_list() only sees unevictable pages that made it onto the +active/inactive LRU lists. Note that these pages do not have PageUnevictable +set - otherwise they would be on the unevictable list and shrink_active_list() +would never see them. + +Some examples of these unevictable pages on the LRU lists are: + + (1) ramfs pages that have been placed on the LRU lists when first allocated. + + (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to + allocate or fault in the pages in the shared memory region. This happens + when an application accesses the page the first time after SHM_LOCK'ing + the segment. + + (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, + but events left mlock_count too low, so they were munlocked too early. 
+ +vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously +unevictable pages found on the inactive lists to the appropriate memory cgroup +and node unevictable list. + +rmap's page_referenced_one(), called via vmscan's shrink_active_list() or +shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +to correct them. Such pages are culled to the unevictable list when released +by the shrinker. diff --git a/Documentation/mm/vmalloc.rst b/Documentation/mm/vmalloc.rst new file mode 100644 index 000000000000..363fe20d6b9f --- /dev/null +++ b/Documentation/mm/vmalloc.rst @@ -0,0 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================== +Virtually Contiguous Memory Allocation +====================================== diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst new file mode 100644 index 000000000000..fc8c67833af6 --- /dev/null +++ b/Documentation/mm/vmalloced-kernel-stacks.rst @@ -0,0 +1,153 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtually Mapped Kernel Stack Support +===================================== + +:Author: Shuah Khan + +.. contents:: :local: + +Overview +-------- + +This is a compilation of information from the code and original patch +series that introduced the `Virtually Mapped Kernel Stacks feature +` + +Introduction +------------ + +Kernel stack overflows are often hard to debug and make the kernel +susceptible to exploits. Problems could show up at a later time making +it difficult to isolate and root-cause. + +Virtually-mapped kernel stacks with guard pages causes kernel stack +overflows to be caught immediately rather than causing difficult to +diagnose corruptions. + +HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable +support for virtually mapped stacks with guard pages. 
This feature +causes reliable faults when the stack overflows. The usability of +the stack trace after overflow and response to the overflow itself +is architecture dependent. + +.. note:: + As of this writing, arm64, powerpc, riscv, s390, um, and x86 have + support for VMAP_STACK. + +HAVE_ARCH_VMAP_STACK +-------------------- + +Architectures that can support Virtually Mapped Kernel Stacks should +enable this bool configuration option. The requirements are: + +- vmalloc space must be large enough to hold many kernel stacks. This + may rule out many 32-bit architectures. +- Stacks in vmalloc space need to work reliably. For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. +- If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +VMAP_STACK +---------- + +VMAP_STACK bool configuration option when enabled allocates virtually +mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. + +- Enable this if you want to use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be caught + immediately rather than causing difficult-to-diagnose corruption. + +.. note:: + + Using this feature with KASAN requires architecture support + for backing virtual mappings with real shadow memory, and + KASAN_VMALLOC must be enabled. + +.. note:: + + When VMAP_STACK is enabled, it is not possible to run DMA on stack + allocated data. + +Kernel configuration options and dependencies keep changing.
Refer to +the latest code base: + +`Kconfig ` + +Allocation +----------- + +When a new kernel thread is created, thread stack is allocated from +virtually contiguous memory pages from the page level allocator. These +pages are mapped into contiguous kernel virtual space with PAGE_KERNEL +protections. + +alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack +with PAGE_KERNEL protections. + +- Allocated stacks are cached and later reused by new threads, so memcg + accounting is performed manually on assigning/releasing stacks to tasks. + Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. +- vm_struct is cached to be able to find when thread free is initiated + in interrupt context. free_thread_stack() can be called in interrupt + context. +- On arm64, all VMAP'd stacks need to have the same alignment to ensure + that VMAP'd stack overflow detection works correctly. Arch specific + vmap stack allocator takes care of this detail. +- This does not address interrupt stacks - according to the original patch + +Thread stack allocation is initiated from clone(), fork(), vfork(), +kernel_thread() via kernel_clone(). Leaving a few hints for searching +the code base to understand when and how thread stack is allocated. + +Bulk of the code is in: +`kernel/fork.c `. + +stack_vm_area pointer in task_struct keeps track of the virtually allocated +stack and a non-null stack_vm_area pointer serves as an indication that the +virtually mapped kernel stacks are enabled. + +:: + + struct vm_struct *stack_vm_area; + +Stack overflow handling +----------------------- + +Leading and trailing guard pages help detect stack overflows. When stack +overflows into the guard pages, handlers have to be careful not to overflow +the stack again. When handlers are called, it is likely that very little +stack space is left. + +On x86, this is done by handling the page fault indicating the kernel +stack overflow on the double-fault stack.
+ +Testing VMAP allocation with guard pages +---------------------------------------- + +How do we ensure that VMAP_STACK is actually allocating with a leading +and trailing guard page? The following lkdtm tests can help detect any +regressions. + +:: + + void lkdtm_STACK_GUARD_PAGE_LEADING() + void lkdtm_STACK_GUARD_PAGE_TRAILING() + +Conclusions +----------- + +- A percpu cache of vmalloced stacks appears to be a bit faster than a + high-order stack allocation, at least when the cache hits. +- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and + simply embeds the thread_info (containing only flags) and 'int cpu' into + task_struct. +- The thread stack can be free'ed as soon as the task is dead (without + waiting for RCU) and then, if vmapped stacks are in use, cache the + entire stack for reuse on the same cpu. diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst new file mode 100644 index 000000000000..c9c495f62d12 --- /dev/null +++ b/Documentation/mm/vmemmap_dedup.rst @@ -0,0 +1,223 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +A vmemmap diet for HugeTLB and Device DAX +========================================= + +HugeTLB +======= + +The struct page structures (page structs) are used to describe a physical +page frame. By default, there is a one-to-one mapping from a page frame to +its corresponding page struct. + +HugeTLB pages consist of multiple base page size pages and are supported by many +architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more +details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are +currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page +consists of 512 base pages and a 1GB HugeTLB page consists of 262144 base pages. +For each base page, there is a corresponding page struct.
+ +Within the HugeTLB subsystem, only the first 4 page structs are used to +contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides +this upper limit. The only 'useful' information in the remaining page structs +is the compound_head field, and this field is the same for all tail pages. + +By removing redundant page structs for HugeTLB pages, memory can be returned +to the buddy allocator for other uses. + +Different architectures support different HugeTLB pages. For example, the +following table is the HugeTLB page size supported by x86 and arm64 +architectures. Because arm64 supports 4k, 16k, and 64k base pages and +supports contiguous entries, so it supports many kinds of sizes of HugeTLB +page. + ++--------------+-----------+-----------------------------------------------+ +| Architecture | Page Size | HugeTLB Page Size | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| x86-64 | 4KB | 2MB | 1GB | | | ++--------------+-----------+-----------+-----------+-----------+-----------+ +| | 4KB | 64KB | 2MB | 32MB | 1GB | +| +-----------+-----------+-----------+-----------+-----------+ +| arm64 | 16KB | 2MB | 32MB | 1GB | | +| +-----------+-----------+-----------+-----------+-----------+ +| | 64KB | 2MB | 512MB | 16GB | | ++--------------+-----------+-----------+-----------+-----------+-----------+ + +When the system boot up, every HugeTLB page has more than one struct page +structs which size is (unit: pages):: + + struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + +Where HugeTLB_Size is the size of the HugeTLB page. We know that the size +of the HugeTLB page is always n times PAGE_SIZE. So we can get the following +relationship:: + + HugeTLB_Size = n * PAGE_SIZE + +Then:: + + struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE + = n * sizeof(struct page) / PAGE_SIZE + +We can use huge mapping at the pud/pmd level for the HugeTLB page. 
+ +For the HugeTLB page of the pmd level mapping, then:: + + struct_size = n * sizeof(struct page) / PAGE_SIZE + = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE + = sizeof(struct page) / sizeof(pte_t) + = 64 / 8 + = 8 (pages) + +Where n is how many pte entries which one page can contains. So the value of +n is (PAGE_SIZE / sizeof(pte_t)). + +This optimization only supports 64-bit system, so the value of sizeof(pte_t) +is 8. And this optimization also applicable only when the size of struct page +is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the +size of struct page structs of it is 8 page frames which size depends on the +size of the base page. + +For the HugeTLB page of the pud level mapping, then:: + + struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) + = PAGE_SIZE / 8 * 8 (pages) + = PAGE_SIZE (pages) + +Where the struct_size(pmd) is the size of the struct page structs of a +HugeTLB page of the pmd level mapping. + +E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB +HugeTLB page consists in 4096. + +Next, we take the pmd level mapping of the HugeTLB page as an example to +show the internal implementation of this optimization. There are 8 pages +struct page structs associated with a HugeTLB page which is pmd mapped. 
+ +Here is how things look before optimization:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | -------------> | 2 | + | | +-----------+ +-----------+ + | | | 3 | -------------> | 3 | + | | +-----------+ +-----------+ + | | | 4 | -------------> | 4 | + | PMD | +-----------+ +-----------+ + | level | | 5 | -------------> | 5 | + | mapping | +-----------+ +-----------+ + | | | 6 | -------------> | 6 | + | | +-----------+ +-----------+ + | | | 7 | -------------> | 7 | + | | +-----------+ +-----------+ + | | + | | + | | + +-----------+ + +The value of page->compound_head is the same for all tail pages. The first +page of page structs (page 0) associated with the HugeTLB page contains the 4 +page structs necessary to describe the HugeTLB. The only use of the remaining +pages of page structs (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +will be used for each HugeTLB page. This will allow us to free the remaining +7 pages to the buddy allocator. 
+ +Here is how things look after remapping:: + + HugeTLB struct pages(8 pages) page frame(8 pages) + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | | + | | | 2 | -----------------+ | | | | | + | | +-----------+ | | | | | + | | | 3 | -------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | ---------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | -----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | -------------------------+ | + | | +-----------+ | + | | | 7 | ---------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +When a HugeTLB is freed to the buddy system, we should allocate 7 pages for +vmemmap pages and restore the previous mapping relationship. + +For the HugeTLB page of the pud level mapping. It is similar to the former. +We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. + +Apart from the HugeTLB page of the pmd/pud level mapping, some architectures +(e.g. aarch64) provides a contiguous bit in the translation table entries +that hints to the MMU to indicate that it is one of a contiguous set of +entries that can be cached in a single TLB entry. + +The contiguous bit is used to increase the mapping size at the pmd and pte +(last) level. So this type of HugeTLB page can be optimized only when its +size of the struct page structs is greater than 1 page. + +Notice: The head vmemmap page is not freed to the buddy allocator and all +tail vmemmap pages are mapped to the head vmemmap page frame. So we can see +more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) +associated with each HugeTLB page. The compound_head() can handle this +correctly (more details refer to the comment above compound_head()). 
+ +Device DAX +========== + +The device-dax interface uses the same tail deduplication technique explained +in the previous chapter, except when used with the vmemmap in +the device (altmap). + +The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), +PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). + +The differences with HugeTLB are relatively minor. + +It only uses 3 page structs for storing all information as opposed +to 4 on HugeTLB pages. + +There's no remapping of vmemmap given that device-dax memory is not part of +System RAM ranges initialized at boot. Thus the tail page deduplication +happens at a later stage when we populate the sections. HugeTLB reuses the +head vmemmap page representing, whereas device-dax reuses the tail +vmemmap page. This results in only half of the savings compared to HugeTLB. + +Deduplicated tail pages are not mapped read-only. + +Here's how things look on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | ----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | ------------------------+ | + | | +-----------+ | + | | | 7 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ diff --git a/Documentation/mm/z3fold.rst b/Documentation/mm/z3fold.rst new file mode 100644 index 000000000000..224e3c61d686 --- /dev/null +++ b/Documentation/mm/z3fold.rst @@ -0,0 +1,30 @@ +.. _z3fold: + +====== +z3fold +====== + +z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page. +It is a zbud derivative which allows for higher compression +ratio keeping the simplicity and determinism of its predecessor. + +The main differences between z3fold and zbud are: + +* unlike zbud, z3fold allows for up to PAGE_SIZE allocations +* z3fold can hold up to 3 compressed pages in its page +* z3fold doesn't export any API itself and is thus intended to be used + via the zpool API. + +To keep the determinism and simplicity, z3fold, just like zbud, always +stores an integral number of compressed pages per page, but it can store +up to 3 pages unlike zbud which can store at most 2. Therefore the +compression ratio goes to around 2.7x while zbud's one is around 1.7x. + +Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not +return a dereferenceable pointer. Instead, it returns an unsigned long +handle which encodes actual location of the allocated object. + +Keeping effective compression ratio close to zsmalloc's, z3fold doesn't +depend on MMU enabled and provides more predictable reclaim behavior +which makes it a better fit for small and response-critical systems. diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst new file mode 100644 index 000000000000..6e79893d6132 --- /dev/null +++ b/Documentation/mm/zsmalloc.rst @@ -0,0 +1,82 @@ +.. _zsmalloc: + +======== +zsmalloc +======== + +This allocator is designed for use with zram. Thus, the allocator is +supposed to work well under low memory conditions. In particular, it +never attempts higher order page allocation which is very likely to +fail under memory pressure. On the other hand, if we just use single +(0-order) pages, it would suffer from very high fragmentation -- +any object of size PAGE_SIZE/2 or larger would occupy an entire page. +This was one of the major issues with its predecessor (xvmalloc). 
+ +To overcome these issues, zsmalloc allocates a bunch of 0-order pages +and links them together using various 'struct page' fields. These linked +pages act as a single higher-order page i.e. an object can span 0-order +page boundaries. The code refers to these linked pages as a single entity +called zspage. + +For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE +since this satisfies the requirements of all its current users (in the +worst case, page is incompressible and is thus stored "as-is" i.e. in +uncompressed form). For allocation requests larger than this size, failure +is returned (see zs_malloc). + +Additionally, zs_malloc() does not return a dereferenceable pointer. +Instead, it returns an opaque handle (unsigned long) which encodes actual +location of the allocated object. The reason for this indirection is that +zsmalloc does not keep zspages permanently mapped since that would cause +issues on 32-bit systems where the VA region for kernel space mappings +is very small. So, before using the allocated memory, the object has to +be mapped using zs_map_object() to get a usable pointer and subsequently +unmapped using zs_unmap_object(). + +stat +==== + +With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via +``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ...
+ + +class + index +size + object size zspage stores +almost_empty + the number of ZS_ALMOST_EMPTY zspages(see below) +almost_full + the number of ZS_ALMOST_FULL zspages(see below) +obj_allocated + the number of objects allocated +obj_used + the number of objects allocated to the user +pages_used + the number of pages allocated for the class +pages_per_zspage + the number of 0-order pages to make a zspage + +We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where + +* n = number of allocated objects +* N = total number of objects zspage can store +* f = fullness_threshold_frac(ie, 4 at the moment) + +Similarly, we assign zspage to: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst index 0c8276109fc0..30c69e1f44fe 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst @@ -13,7 +13,7 @@ 监测数据访问 ============ -:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON, +:doc:`DAMON ` 允许轻量级的数据访问监测。使用DAMON, 用户可以分析他们系统的内存访问模式,并优化它们。 .. toctree:: diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst index 1500bdbf338a..c976f3e33ffd 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst @@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做,这样我们就可以退回到基于LRU列 .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. 
[3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index eee0e8c5c368..cd41ada4fdad 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 :ref:`sysfs interface `。 - *内核空间编程接口。* - :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 + :doc:`这 ` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 - 详细情况请参考接口 :doc:`文件 `。 + 详细情况请参考接口 :doc:`文件 `。 sysfs接口 ========= @@ -148,7 +148,7 @@ contexts//monitoring_attrs/ 在 ``nr_regions`` 目录下,有两个文件分别用于DAMON监测区域的下限和上限(``min`` 和 ``max`` ), 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。 -关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/vm/damon/design`)。 +关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/mm/damon/design`)。 contexts//targets/ --------------------- @@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, ---- 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/vm/damon/design` 。例如, +以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: # cd /damon diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst index 26d9913fc8b6..b03020c8b2ab 100644 --- a/Documentation/translations/zh_CN/core-api/index.rst +++ b/Documentation/translations/zh_CN/core-api/index.rst @@ -101,7 +101,7 @@ Todolist: ======== 如何在内核中分配和使用内存。请注意,在 -:doc:`/vm/index` 中有更多的内存管理文档。 +:doc:`/mm/index` 中有更多的内存管理文档。 .. 
toctree:: :maxdepth: 1 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index ad7bb8c17562..bf85baca8b3e 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -118,7 +118,7 @@ TODOList: sound/index filesystems/index scheduler/index - vm/index + mm/index peci/index TODOList: diff --git a/Documentation/translations/zh_CN/mm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst new file mode 100644 index 000000000000..c2816f523bd7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/active_mm.rst @@ -0,0 +1,85 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/active_mm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +========= +Active MM +========= + +这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。 + +:: + + List: linux-kernel + Subject: Re: active_mm + From: Linus Torvalds + Date: 1999-07-30 21:36:24 + + 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些, + 且更多的人在阅读它们时,我觉得棒极了。 + + 1999年7月30日 星期五, David Mosberger 写道: + > + > 是否有一个简短的描述,说明task_struct中的 + > "mm" 和 "active_mm"应该如何使用? 
(如果 + > 这个问题在邮件列表中讨论过,我表示歉意--我刚 + > 刚度假回来,有一段时间没能关注linux-kernel了)。 + + 基本上,新的设定是: + + - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用 + 户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间 + 处于活动状态。 + + 一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线 + 程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不 + 会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老 + 式的bdflush sync能做到这一点。 + + - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL, + 其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。 + + - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们 + 有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。 + + 规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm + 显然必须与真实的mm相同。 + + 对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时 + “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。 + + 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users” + 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy” + 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。 + + 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而 + 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只** + 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一 + 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。 + + 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。 + “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主 + 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码 + + if (current->mm == &init_mm) + + 一般来说,应该用 + + if (!current->mm) + + 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常 + 由缺页异常处理程序和类似的东西来完成)。 + + 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改 + 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是 + 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者 + 连接起来,你需要同时切换两者)。 + + (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst new file mode 100644 index 000000000000..6fd79209c307 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/balance.rst @@ -0,0 +1,81 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/balance.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +内存平衡 +======== + +2000年1月开始,作者:Kanoj Sarcar + +对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 +内存平衡。 + +调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 +原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 +选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 + +__GFP_IO分配请求是为了防止文件系统死锁。 + +在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 +说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 + +也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, +这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 +和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 +常规的内存请求,而不是产生常规区域平衡的开销。 + +在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma +和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 +大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 +存在,这个问题变得更加严重。 + +在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), +我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 +候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 +而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 +个简单的等式。 + +另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 +低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 +的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 +内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 + +请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 +空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 + +所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd +被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() +一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM +页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 + +kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 +是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, +kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone +结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 + +如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 +已经低于其水位,则会进行偷取。 + +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: +这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, +hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 +[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 +请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 
+hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, +zone_wake_kswapd也被设置。 + + +我所听到的(超棒的)想法: + +1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 + +2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 + (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/mm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst new file mode 100644 index 000000000000..5593a83c86bc --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/api.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/api.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======= +API参考 +======= + +内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , +它位于源代码树的include/linux/。 + +结构体 +====== + +该API在以下内核代码中: + +include/linux/damon.h + + +函数 +==== + +该API在以下内核代码中: + +mm/damon/core.c diff --git a/Documentation/translations/zh_CN/mm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst new file mode 100644 index 000000000000..16e3db34a7dd --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/design.rst @@ -0,0 +1,140 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/design.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +==== +设计 +==== + +可配置的层 +========== + +DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 +并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON +将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 + +由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 +任何地址空间。如果没有提供合适的,用户可以自己实现基元。 + +例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 +另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 + + +特定地址空间基元的参考实现 +========================== + +基本访问监测的低级基元被定义为两部分。: + +1. 确定地址空间的监测目标地址范围 +2. 
目标空间中特定地址范围的访问检查。 + +DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 + + +基于VMA的目标地址范围构造 +------------------------- + +这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 + +在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 +址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 +格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 +内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 + +出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 +区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 +的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 +在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: + + + + + (small mmap()-ed regions and munmap()-ed regions) + + + + + +基于PTE访问位的访问检查 +----------------------- + +物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 +找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 +是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, +并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 +踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 +``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 + + +独立于地址空间的核心机制 +======================== + +下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 +``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 + + +访问频率监测 +------------ + +DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 +``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 +个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 +去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 +果。这可以用以下简单的伪代码来描述:: + + while monitoring_on: + for page in monitoring_target: + if accessed(page): + nr_accesses[page] += 1 + if time() % aggregation_interval == 0: + for callback in user_registered_callbacks: + callback(monitoring_target, nr_accesses) + for page in monitoring_target: + nr_accesses[page] = 0 + sleep(sampling interval) + +这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 + + +基于区域的抽样调查 +------------------ + +为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 +这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 +于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 +查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 +区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 + +然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 + + +适应性区域调整 +-------------- + 
+即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 +据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 +区域的访问频率自适应地进行合并和拆分。 + +对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 +然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, +它将每个区域拆分为两个或三个区域。 + +通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 + + +动态目标空间更新处理 +-------------------- + +监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 +热插拔。 + +由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, +并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 +数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/mm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst new file mode 100644 index 000000000000..de4be417494a --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/faq.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/faq.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +常见问题 +======== + +为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? +========================================================== + +首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 +空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 +(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 + + +“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? +============================================== + +闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 +使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 +存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 +DAMON输出的一个子集,但不能替代DAMON。 + + +DAMON是否只支持虚拟内存? +========================= + +不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始 +部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技 +术来监测任何地址空间。 + +尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间 +相关功能的实现,以供参考和方便使用。 + + +我可以简单地监测页面的粒度吗? 
+============================== + +是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。 +因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。 diff --git a/Documentation/translations/zh_CN/mm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst new file mode 100644 index 000000000000..b03bf307204f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/damon/index.rst @@ -0,0 +1,32 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/damon/index.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +========================== +DAMON:数据访问监视器 +========================== + +DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 +(该核心机制详见(Documentation/translations/zh_CN/mm/damon/design.rst)) + + - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), + - *轻量级* (监控开销低到可以在线应用),以及 + - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。 + +因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实 +验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写 +个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。 + +.. toctree:: + :maxdepth: 2 + + faq + design + api diff --git a/Documentation/translations/zh_CN/mm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst new file mode 100644 index 000000000000..83b14cce9adf --- /dev/null +++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst @@ -0,0 +1,38 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/free_page_reporting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========== +空闲页报告 +========== + +空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟 +化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页 +面。 + +对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info +结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必 +须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。 +假设没有其他页面报告设备已经注册, 对page_reporting_register的调用将向报告框 +架注册页面报告接口。 + +一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告 +页面,并在任何足够高的页面被释放之后2秒继续报告。 + +报告的页面将被存储在传递给报告函数的散列表中,最后一个条目的结束位被设置在条目 +nent-1中。 当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些 +页将被返回到它们所获得的自由区域。 + +在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除 +目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报 +告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告 +空闲页方面的工作。 + + +Alexander Duyck, 2019年12月04日 diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst new file mode 100644 index 000000000000..5c18ea2be04f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/frontswap.rst @@ -0,0 +1,196 @@ +:Original: Documentation/mm/frontswap.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========= +Frontswap +========= + +Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 +于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 +节省(提高)。 + +..
_Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ + +Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 +储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory +(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 +求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 +调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops +的功能,它提供的功能必须符合某些策略,如下所示: + +一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap +交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 +量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 +内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 +transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 +相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 + +一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 +要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 +经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, +也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 +可以像往常一样被写入交换空间。 + +请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” +的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 +不会从frontswap中获得。 + +如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 +debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): + +``failed_stores`` + 有多少次存储的尝试是失败的 + +``loads`` + 尝试了多少次加载(应该全部成功) + +``succ_stores`` + 有多少次存储的尝试是成功的 + +``invalidates`` + 尝试了多少次作废 + +后台实现可以提供额外的指标。 + +经常问到的问题 +============== + +* 价值在哪里? 
+ +当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 +读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 +能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 +移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 +页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 + +Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 +灵活性: + +在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 +全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 +用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 +工作负载上,则有明显的性能改善(25%以上)。 + +“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory +的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 +统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 +交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 +可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 +有多少内存可用 + +在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 +用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 +是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory +后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, +而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 +的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap +允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), +从而减轻计划外交换可能带来的可怕的性能影响。 + +一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 +内存扩展技术的调查也在进行中。 + +* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? + +如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 +个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 +frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 +是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 +后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 +忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 +的,无论如何使用一小部分的 CPU 都是不相关的。 + +至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 +每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 +页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 +会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 +非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 + +当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 +产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 +来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 + +* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? 
+ +我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 +明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 +供了多少内存是完全动态和随机的。 + +每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 +型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 +换页的尝试。 + +每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 +frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 +间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap +backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 +页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 +据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 +下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 +页面偏移量,否则它就会将数据写入该设备。 + +当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), +检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 +的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, +以便从真正的交换设备上获得这一页的数据。 + +所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 +入都被 “frontswap backend store” 和(可能)“frontswap backend loads” +所取代,这可能会快得多。 + +* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 + 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? 
+ +首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 +结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 +假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 +及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 + +例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend +的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 +先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 +以根据当前的内存限制动态地定义。 + +此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 +块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 +不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 +态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 +复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 +的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” +线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, +KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 + +在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 +例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 +自己的交换。 + +transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 +能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, +frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 +容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 +swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap +就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 +backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 + + +* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, + 难道它不能总是被成功地覆盖吗? + +几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 +缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 +backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap +拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 +换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 + +* 为什么frontswap补丁会创建新的头文件swapfile.h? + +frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 +在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 +个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 + +Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/mm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst new file mode 100644 index 000000000000..81202c65e000 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/highmem.rst @@ -0,0 +1,128 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/highmem.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +========== +高内存处理 +========== + +作者: Peter Zijlstra + +.. contents:: :local: + +高内存是什么? +============== + +当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 +核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 +存的临时映射。 + +没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 +各种架构上的限制。 + +例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 +出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 +间进行划分。 + +使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: + + +--------+ 0xffffffff + | Kernel | + +--------+ 0xc0000000 + | | + | User | + | | + +--------+ 0x00000000 + +这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 +情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 + +其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 +用mm上下文标签时,其虚拟空间有限。 + + +临时虚拟映射 +============ + +内核包含几种创建临时映射的方法。: + +* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization + 来解除映射。 + +* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 + 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 + +* kmap_atomic(). 
这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, + 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 + + kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者在调用kunmap_atomic() + 之前也不得睡眠。 + + 可以假设k[un]map_atomic()不会失败。 + + +使用kmap_atomic +=============== + +何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) +分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 +下面类似:: + + /* 找到感兴趣的页面。 */ + struct page *page = find_get_page(mapping, offset); + + /* 获得对该页内容的访问权。 */ + void *vaddr = kmap_atomic(page); + + /* 对该页的内容做一些处理。 */ + memset(vaddr, 0, PAGE_SIZE); + + /* 解除该页面的映射。 */ + kunmap_atomic(vaddr); + +注意,kunmap_atomic()调用接受的是kmap_atomic()调用的结果,而不是其参数。 + +如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 +格嵌套,如:: + + vaddr1 = kmap_atomic(page1); + vaddr2 = kmap_atomic(page2); + + memcpy(vaddr1, vaddr2, PAGE_SIZE); + + kunmap_atomic(vaddr2); + kunmap_atomic(vaddr1); + + +临时映射的成本 +============== + +创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 + +如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 +指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 + +如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 + + +i386 PAE +======== + +在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: + +* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: + +* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 + 112G的页;然而,内核需要在这部分内存中存储的不只是页帧...... + +* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 + 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 + +一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 +量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst new file mode 100644 index 000000000000..5024a8a15516 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hmm.rst @@ -0,0 +1,361 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hmm.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================== +异构内存管理 (HMM) +================== + +提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 +基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 + +HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 +地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 +必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 + +本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 +部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 +表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, +最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 + +.. contents:: :local: + +使用特定于设备的内存分配器的问题 +================================ + +具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 +造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) +之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: +即,设备可以透明地使用任何应用程序内存区域。 + +分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 +看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 + +具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap +私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, +但是是设备文件)。 + +对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), +很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, +而且由于数据集和地址的重复,程序更难调试。 + +分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 +不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 +拷贝而浪费资源。 + +复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 +这将导致库入口点的组合爆炸。 + +最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 +序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 +其他模式,使用共享地址空间也更合理。 + + +I/O 总线、设备内存特性 +====================== + +由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 +内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 +不是缓存一致的。 + +如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 +致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU +只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 +能被视为与常规内存等同。 + +另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU +内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 +己的内存时高一个数量级。 + +一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 +(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 
+有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 +件解决方案。 + +因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 +存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 + + +共享地址空间和迁移 +================== + +HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 +于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 + +为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 +更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 +缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 +过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 +动程序的同时,把一切可以考虑的因素都考虑进去了。 + +HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 +struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 +制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 +struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, +首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 +给设备驱动程序。 + +请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU +地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 +并启动向主内存的迁移。 + +凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 +过迁移设备正在使用的数据集部分来利用设备内存。 + + +地址空间镜像实现和API +===================== + +地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 +保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier +开始:: + + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); + +在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 +读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 + +当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: + + int hmm_range_fault(struct hmm_range *range); + +如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 +页异常代码路径,就像 CPU 缺页异常一样。 + +这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 +范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 + +在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 +持事物正确同步。使用模式是:: + + int driver_populate_range(...) + { + struct hmm_range range; + ... 
+ + range.notifier = &interval_sub; + range.start = ...; + range.end = ...; + range.hmm_pfns = ...; + + if (!mmget_not_zero(interval_sub->notifier.mm)) + return -EFAULT; + + again: + range.notifier_seq = mmu_interval_read_begin(&interval_sub); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + if (ret) { + mmap_read_unlock(mm); + if (ret == -EBUSY) + goto again; + return ret; + } + mmap_read_unlock(mm); + + take_lock(driver->update); + if (mmu_interval_read_retry(&ni, range.notifier_seq) { + release_lock(driver->update); + goto again; + } + + /* Use pfns array content to update device page table, + * under the update lock */ + + release_lock(driver->update); + return 0; + } + +driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 +mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 + +利用 default_flags 和 pfn_flags_mask +==================================== + +hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 +的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 + +例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = 0; + +并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 + +现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 +置:: + + range->default_flags = HMM_PFN_REQ_FAULT; + range->pfn_flags_mask = HMM_PFN_REQ_WRITE; + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; + +有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 +== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 +CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 + +hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 +面可写,将设置 HMM_PFN_WRITE。 + + +从核心内核的角度表示和管理设备内存 +================================== + +尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 +的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 +事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 +能理解这种新的内存类型。 + +大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, +HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 +我们只需要确保没有人试图从 CPU 端映射这些页面。 + +移入和移出设备内存 +================== + +由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 
+储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 +migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 + +在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 +作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 + +这些可以通过以下方式分配和释放:: + + struct resource *res; + struct dev_pagemap pagemap; + + res = request_free_mem_region(&iomem_resource, /* number of bytes */, + "name of driver resource"); + pagemap.type = MEMORY_DEVICE_PRIVATE; + pagemap.range.start = res->start; + pagemap.range.end = res->end; + pagemap.nr_range = 1; + pagemap.ops = &device_devmem_ops; + memremap_pages(&pagemap, numa_node_id()); + + memunmap_pages(&pagemap); + release_mem_region(pagemap.range.start, range_len(&pagemap.range)); + +还有devm_request_free_mem_region(), devm_memremap_pages(), +devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. + +整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , +但这些步骤分为设备驱动程序特定代码和共享公共代码: + +1. ``mmap_read_lock()`` + + 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), + 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 + +2. 
``migrate_vma_setup(struct migrate_vma *args)`` + + 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 + migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 + 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 + ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 + 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 + 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA + 范围可以被迁移到系统内存和设备私有内存。 + + migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` + 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 + 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 + ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , + 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 + migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 + 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 + + + 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 + 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 + 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 + ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 + 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() + 还清除了 ``args->dst`` 数组。 + +3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 + + 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 + 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 + 择跳过页面迁移。 + + 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` + 锁定该页,并将 ``dst`` 数组条目填入:: + + dst[i] = migrate_pfn(page_to_pfn(dpage)); + + 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 + 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 + 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 + + 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 + ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 + 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 + +4. ``migrate_vma_pages()`` + + 这一步是实际“提交”迁移的地方。 + + 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 + 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 + 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 + ``MIGRATE_PFN_MIGRATE`` 位被清除。 + + 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 + ``struct page`` ,最终完成CPU端的迁移。 + +5. 
设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 + + 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 + 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 + +6. ``migrate_vma_finalize()`` + + 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` + 的引用。 + +7. ``mmap_read_unlock()`` + + 现在可以释放锁了。 + +独占访问存储器 +============== + +一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 +个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU +的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 +个内存范围不能从用户空间访问。 + +这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 +导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 +经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 +放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 + +内存 cgroup (memcg) 和 rss 统计 +=============================== + +目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, +如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 +这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 +内存,运行不受影响。 + +一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, +因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 +于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 + +对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 +页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 +内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 +内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 + +请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 +总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst new file mode 100644 index 000000000000..752e5696cd47 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -0,0 +1,436 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/hugetlbfs_reserv.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +============== +Hugetlbfs 预留 +============== + +概述 +==== + +:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 +示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 +时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 +持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 +没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 +简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 +样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” +巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 +中是如何进行巨页预留处理的。 + + +读者 +==== +这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 + + +数据结构 +======== + +resv_huge_pages + 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 + 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 +Reserve Map + 预留映射由以下结构体描述:: + + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + + 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 + 区域。一个区域被描述为:: + + struct file_region { + struct list_head link; + long from; + long to; + }; + + file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 + reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 +Flags for MAP_PRIVATE Reservations + 这些被存储在预留的映射指针的底部。 + + ``#define HPAGE_RESV_OWNER (1UL << 0)`` + 表示该任务是与该映射相关的预留的所有者。 + ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` + 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 + 射了一个页面。 +Page Flags + PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 + “释放巨页” 一节中讨论。 + + +预留映射位置(私有或共享) +========================== + +一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 +(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, +预留映射的位置和语义是明显不同的。位置的差异是: + +- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 + 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 +- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 + 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 + 留映射。因此,预留映射在创建节点时被分配。 + + +创建预留 +======== +当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) 
+创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: + + int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE +标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 + +参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 +的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' +参数已经被这个偏移量所调整。 + +PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 + +- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 + 修改。 +- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 + 因此,预留映射也可用于确定哪些预留已被消耗。 + +对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, +HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 + +预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 +然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, +请参见 :ref:`预留映射的修改 ` 一节。 + +该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 +有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` +一节。 + +在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 +并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 +些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages +会被调整,如下所示:: + + if (resv_needed <= (resv_huge_pages - free_huge_pages)) + resv_huge_pages += resv_needed; + +注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 + +如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 +反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, +不对预留映射进行修改,因为没有条目表示存在预留。 + +如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 +在'from'-'to'范围内存在预留。 + +消耗预留/分配一个巨页 +=========================== + +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +中进行的:: + + struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + +alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 +外拷贝被分配。 + + +调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详 +细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从 
+vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。 +如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留, +则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在 +确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 +的参数: + +- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, + 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 + 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 + +与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时, +free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整:: + + SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留, + * 如果遇到错误,以至于必须释放这个页面,预留将被 + * 恢复。 */ + resv_huge_pages--; /* 减少全局预留计数 */ + +注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 +的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: +SetPagePrivate(page) 和 resv_huge_pages--. + +在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +面被释放时,这将被用于子池的计数。 + +然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 +到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 +已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一 +个新的条目。 + +注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 +的剩余巨页和过度分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整。 +SetPagePrivate(page)和resv_huge_pages-。 + +在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +面被释放时,这将被用于子池的计数。 + +然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 +到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 +已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 +一个新的条目。 + +在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 +享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 +这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 +识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请 +参见 :ref:`预留映射帮助函数 ` 一节。 + + +实例化巨页 +========== + +在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓 +存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此, +当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。 + + +释放巨页 +======== + +巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 
+递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 +留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 + +page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数 +应该被调整(关于如何设置这些标志的信息,请参见 +:ref: `消耗预留/分配一个巨页 ` )。 + + +该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于 +传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。 +因此,在这种情况下,全局resv_huge_pages计数器被递增。 + +如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。 + +子池预留 +======== + +有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一 +个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关 + +当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。 +如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体 +hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages) +被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。 + +当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。 +hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 “已用页面” 计数 +(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或 +一个错误。 + +然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局 +池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来 +满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用 +者要负责从全局池中获取另外两个页面。 + + +COW和预留 +========== + +由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可 +以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它 +自己的页。 + +当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 +页,分配会失败。 + +当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 +由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 +时,对预留的所有者和非所有者采取不同的行动。 + +在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 + +如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 +从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED +位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 +会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 + +预留映射的修改 +============== + +以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 +助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 +记录。这些函数是:: + + long region_chg(struct resv_map *resv, long f, long t); + long region_add(struct resv_map *resv, long f, long t); + void region_abort(struct resv_map *resv, long f, long t); + long region_count(struct resv_map 
*resv, long f, long t); + +在预留映射上的操作通常涉及两个操作: + +1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被表示。 + + 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 + +2) + a) 如果操作能够成功,region_add()将被调用,以实际修改先前传递给region_chg()的相同范围 + [f, t]的预留映射。 + b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 + +注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 +成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 +成功。 + +如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 +到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 +同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 +行更改。在这种情况下,region_add()的返回值将与region_chg()的返回值不符。在这种情况下,全局计数 +和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 + +函数region_del()被调用以从预留映射中移除区域。 +它通常在以下情况下被调用: + +- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 + 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 +- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 + 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del + 的范围是[new_end_of_file, LONG_MAX]。 +- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 + 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 + 围是[page_idx, page_idx + 1]。 + +在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() +会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 +结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 +预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() +来调整计数器,使其与不能被删除的预留映射条目相对应。 + +region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 +因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 +(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 +失,子池和全局预留计数被未完成的预留数量所减去。 + +预留映射帮助函数 +================ + +有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 +地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 +映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, +它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: + + long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: + + long 
vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 +先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 +返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 +现意外的差异,说明在两次调用之间修改了预留映射:: + + void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 +先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: + + long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, + unsigned long addr) + +这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从restore_reserve_on_error()函数 +中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 +了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 +示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 +要做什么的更多信息,请参见 “错误路径中的预留清理” 。 + + +错误路径中的预留清理 +==================== + +正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 +先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 +如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, +一切都很好。 + +此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 +正确的。 + +然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, +页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 +(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 +显示预留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, +并阻止分配一个预先分配的页面。 + +函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 +是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 +数相对应。 + +函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, +它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 +然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 +初设想的那样使用一个预留页。 + +有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, +它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 + + +预留和内存策略 +============== +当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 +在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 +完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 +用:: + + + /* + * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 + * 
成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 + * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 + * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 + * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 + * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 + * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 + */ + +添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 +程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 +预留,也是如此。 + +Hugetlbfs回归测试 +================= + +最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 +libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 +libhugetlbfs中添加适当的测试。 + +-- +Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/mm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst new file mode 100644 index 000000000000..310862edc937 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/hwpoison.rst @@ -0,0 +1,166 @@ + +:Original: Documentation/mm/hwpoison.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +hwpoison +======== + +什么是hwpoison? +=============== + + +即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 +一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 + +这个补丁包在虚拟机中实现了必要的(编程)框架。 + +引用概述中的评论:: + + 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 + 存或高速缓存故障。 + + 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 + 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 + 以安全地忽略它. 
而不是用另外一个机器检查去处理它。 + + 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 + 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 + 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 + 这意味着错误处理可能需要很长的时间。 + + 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 + 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况预计很少发生,所 + 以我们希望这样做不会带来问题。 + +该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 +各种检查组成,用来处理poison的页面。 + +现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm +版本。 + +对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 +机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 +样做,但一些非常专业的应用程序可能会这样做。 + +故障恢复模式 +============ + +内存故障恢复可以处于两种(实际上是三种)模式: + +vm.memory_failure_recovery sysctl 置零: + 所有的内存故障都会导致panic。不尝试恢复。 + +早期处理 + (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS。这允许 + 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象)。这是KVM qemu使用的 + 模式。 + +推迟处理 + 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 + 最好的,也是默认模式。注意一些页面总是被当作late kill处理。 + +用户控制 +======== + +vm.memory_failure_recovery + 参阅 sysctl.txt + +vm.memory_failure_early_kill + 全局启用early kill + +PR_MCE_KILL + 设置early/late kill mode/revert 到系统默认值。 + + arg1: PR_MCE_KILL_CLEAR: + 恢复到系统默认值 + arg1: PR_MCE_KILL_SET: + arg2定义了线程特定模式 + + PR_MCE_KILL_EARLY: + Early kill + PR_MCE_KILL_LATE: + Late kill + PR_MCE_KILL_DEFAULT + 使用系统全局默认值 + + 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 + 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 + +PR_MCE_KILL_GET + 返回当前模式 + +测试 +==== + +* madvise(MADV_HWPOISON, ....) 
(as root) - 在测试过程中Poison一个页面 + +* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 + + corrupt-pfn + 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 + 免在测试套件中损坏非预期页面。 + unpoison-pfn + 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 + 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 + + 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 + + corrupt-filter-dev-major, corrupt-filter-dev-minor + 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 + 配符值。这应该只用于人工注入的测试。 + + corrupt-filter-memcg + 限制注入到memgroup拥有的页面。由memcg的inode号指定。 + + Example:: + + mkdir /sys/fs/cgroup/mem/hwpoison + + usemem -m 100 -s 1000 & + echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks + + memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') + echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg + + page-types -p `pidof init` --hwpoison # shall do nothing + page-types -p `pidof usemem` --hwpoison # poison its pages + + corrupt-filter-flags-mask, corrupt-filter-flags-value + 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 + 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 + 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 + Documentation/admin-guide/mm/pagemap.rst中记录。 + +* 架构特定的MCE注入器 + + x86 有 mce-inject, mce-test + + 在mce-test中的一些便携式hwpoison测试程序,见下文。 + +引用 +==== + +http://halobates.de/mce-lc09-2.pdf + 09年LinuxCon的概述演讲 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git + 测试套件(在tsrc中的hwpoison特定可移植测试)。 + +git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git + x86特定的注入器 + + +限制 +==== +- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 + 复,目前只有LRU页。 + +--- +Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst new file mode 100644 index 000000000000..4c8c6b7b72a3 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -0,0 +1,54 @@ +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/index.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +================= +Linux内存管理文档 +================= + +这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 +和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 +议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 +对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 +(TODO:待引用文档集被翻译完毕后请及时修改此处) + +.. toctree:: + :maxdepth: 1 + + active_mm + balance + damon/index + free_page_reporting + highmem + ksm + frontswap + hmm + hwpoison + hugetlbfs_reserv + memory-model + mmu_notifier + numa + overcommit-accounting + page_frags + page_owner + page_table_check + remap_file_pages + split_page_table_lock + z3fold + zsmalloc + +TODOLIST: +* arch_pgtable_helpers +* free_page_reporting +* hugetlbfs_reserv +* page_migration +* slub +* transhuge +* unevictable-lru +* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst new file mode 100644 index 000000000000..d1f82e857ad7 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/ksm.rst @@ -0,0 +1,70 @@ +..
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/mm/ksm.rst + +:翻译: + + 徐鑫 xu xin + +============ +内核同页合并 +============ + +KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 +到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 +https://lwn.net/Articles/330589 + +KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst +文档中有描述。 + +设计 +==== + +概述 +---- + +概述内容请见mm/ksm.c文档中的“DOC: Overview” + +逆映射 +------ +KSM维护着稳定树中的KSM页的逆映射信息。 + +当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 +KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 +的 ``page->mapping`` 指向了该稳定树节点。 + +如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 +个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` +指向该"副本"。 + +每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 +内容,尽管其中任意一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 + +这样一来,相比于无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 +本身中不能有重复的KSM页面内容仍然是强制要求。 + +由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 +rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap项(即 +虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 +因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 +程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 +的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 +保留在"副本"列表的开头。 + +``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 +树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 +缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 + +``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 +的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片整理算 +法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 +那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 +能会减慢应用程序在KSM页面上的只读计算。 + +KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 +这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 + +参考 +==== +内核代码请见mm/ksm.c。 +涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst new file mode 100644 index 000000000000..77ec149a970c --- /dev/null +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -0,0 +1,135 @@ +..
SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/memory-model.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +============ +物理内存模型 +============ + +系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开 +始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的 +小孔隙。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不 +同的内存库连接到不同的CPU。 + +Linux使用两种内存模型中的一种对这种多样性进行抽象:FLATMEM和SPARSEMEM。每 +个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动 +覆盖该默认值。 + +所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页 +帧的状态。 + +无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存 +在一对一的映射关系。 + +每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn` +帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。 + +FLATMEM +======= + +最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的 +物理内存。 + +在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对 +于大多数架构,孔隙在 `mem_map` 数组中都有条目。与孔隙相对应的 `struct page` +对象从未被完全初始化。 + +为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。 +然而,在调用memblock_free_all()函数之前,映射数组是不能使用的,该函数 +将所有的内存交给页分配器。 + +一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特 +定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。 + +使用FLATMEM,PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET` +是 `mem_map` 数组的一个索引。 + +`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。 + +SPARSEMEM +========= + +SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型, +如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟 +初始化。 + +SPARSEMEM模型将物理内存显示为一个区段的集合。一个区段用mem_section结构 +体表示,它包含 `section_mem_map` ,从逻辑上讲,它是一个指向 `struct page` +阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管理。区段的大小 +和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量 +来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS` +是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任 +意的值。 + +最大的段数表示为 `NR_MEM_SECTIONS` ,定义为 + +..
math:: + + NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} + +`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的 +大小和位置取决于 `CONFIG_SPARSEMEM_EXTREME` 和可能的最大段数: + +* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有 + `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。 +* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。 + 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 + 内存区。 + +架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 + +通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 + "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 + 值决定。 + +Classic sparse在page->flags中编码了一个页面的段号,并使用PFN的高位来访问映射该页 +框的段。在一个区段内,PFN是指向页数组的索引。 + +Sparse vmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操 +作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page` +对象阵列。PFN是该数组的一个索引, `struct page` 从 `vmemmap` 的偏移量是该页的PFN。 + +为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并 +确保 `vmemmap` 指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法, +它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求, +它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。 + +虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分 +配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给 +vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和 +:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。 + +ZONE_DEVICE +=========== +`ZONE_DEVICE` 设施建立在 `SPARSEMEM_VMEMMAP` 之上,为设备驱动识别的物理地址范 +围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下 +事实有关:这些地址范围的页面对象从未被在线标记过,而且必须对设备进行引用,而不仅仅 +是页面,以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` , +为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`, +:c:func:`page_to_pfn` 和 :c:func:`get_user_pages` 服务。由于页面引 +用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru` +空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。 + +虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但 +`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE` +内存从未被在线标记,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界 +上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给 +:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为
+:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 + +`ZONE_DEVICE` 的用户是: + +* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 + +* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , + 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见Documentation/mm/hmm.rst。 + +* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 + 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/mm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst new file mode 100644 index 000000000000..ce3664d1a410 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst @@ -0,0 +1,97 @@ +:Original: Documentation/mm/mmu_notifier.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +什么时候需要页表锁内通知? +========================== + +当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 +mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 + +对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 +IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 +表锁的同时通知这些二级TLB: + + A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 + B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 + +情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 + +情况B更加微妙。为了正确起见,它需要按照以下序列发生: + + - 上页表锁 + - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) + - 设置页表项以指向新页 + +如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 +C++11等内存模型。 + +考虑以下情况(设备使用类似于ATS/PASID的功能)。 + +两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 +写保护(B的其他情况也适用)。 + +:: + + [Time N] -------------------------------------------------------------------- + CPU-thread-0 {尝试写到addrA} + CPU-thread-1 {尝试写到addrB} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {读取addrA并填充设备TLB} + DEV-thread-2 {读取addrB并填充设备TLB} + [Time N+1] ------------------------------------------------------------------ + CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} + CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+2] 
------------------------------------------------------------------ + CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} + CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {写入addrA,这是对新页面的写入} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+3] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {preempted} + CPU-thread-2 {} + CPU-thread-3 {写入addrB,这是一个写入新页的过程} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+4] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {} + DEV-thread-2 {} + [Time N+5] ------------------------------------------------------------------ + CPU-thread-0 {preempted} + CPU-thread-1 {} + CPU-thread-2 {} + CPU-thread-3 {} + DEV-thread-0 {从旧页中读取addrA} + DEV-thread-2 {从新页面读取addrB} + +所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 +就看到了addrB的新值。这就破坏了设备的总内存序。 + +当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range +调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 +在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/mm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst new file mode 100644 index 000000000000..b15cfeeb6dfb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/numa.rst @@ -0,0 +1,101 @@ +:Original: Documentation/mm/numa.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +始于1999年11月,作者: + +========================== +何为非统一内存访问(NUMA)? 
+========================== + +这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 + +从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 +本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 +本文中称这些组件/装配为“单元”。 + +每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 +不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 +链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 +他单元有多个距离。 + +对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统。在ccNUMA系统中, +所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 +系统互连在硬件中处理。 + +内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 +有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 +更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 + +平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 +内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 +[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 + +这就自然而然有了Linux软件对NUMA系统的视角: + +Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 +上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO +总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 +的访问时间和更高的有效带宽。 + +对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 +的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 +的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 + +此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 +有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 +分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, +可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 +使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE +中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 +时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 +“fallback 回退”。 + +由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 +节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, +代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 +离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 + +默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 +图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 +满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 + +本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配
+一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 +中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 +远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 +以在节点之间迁移,远离其初始节点和内核数据结构。 + +系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 +序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 +Linux NUMA内存策略修改内核的默认本地分配行为。 [见 +:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. + +系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 +的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] + +在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 +内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 +将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 +最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 +其他附近的节点。 + +一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 +得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 + +一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 +节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 +路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 +内核分析子系统就是这样的一个例子。 + +如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 +子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 +以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 +一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/mm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst new file mode 100644 index 000000000000..d8452d8b7fbb --- /dev/null +++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst @@ -0,0 +1,86 @@ +:Original: Documentation/mm/overcommit-accounting.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + + +============== +超量使用审计 +============== + +Linux内核支持下列超量使用处理模式 + +0 + 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 + 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, + 允许root分配稍多的内存。这是默认的。 +1 + 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 + 几乎完全由零页组成的虚拟内存 + +2 + 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 + (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 + 不会被杀死,但会在内存分配上收到相应的错误。 + + 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 + 是很有用的。 + 
+超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 + +可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) +来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 + +在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 +的超量使用和提交量。 + +陷阱 +==== + +C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, +你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 +不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 + + +在模式2中,MAP_NORESERVE标志被忽略。 + + +它是如何工作的 +============== + +超量使用是基于以下规则 + +对于文件映射 + | SHARED or READ-only - 0 cost (该文件是映射而不是交换) + | PRIVATE WRITABLE - 每个实例的映射大小 + +对于匿名或者 ``/dev/zero`` 映射 + | SHARED - 映射的大小 + | PRIVATE READ-only - 0 cost (但作用不大) + | PRIVATE WRITABLE - 每个实例的映射大小 + +额外的计数 + | 通过mmap制作可写副本的页面 + | 从同一池中提取的shmfs内存 + +状态 +==== + +* 我们核算mmap内存映射 +* 我们核算mprotect在提交中的变化 +* 我们核算mremap的大小变化 +* 我们的审计 brk +* 审计munmap +* 我们在/proc中报告commit 状态 +* 核对并检查分叉的情况 +* 审查堆栈处理/执行中的构建 +* 叙述SHMfs的情况 +* 实现实际限制的执行 + +待续 +==== +* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst new file mode 100644 index 000000000000..320952ca93af --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -0,0 +1,38 @@ +:Original: Documentation/mm/page_frag.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +页面片段 +======== + +一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 +该页中的多个碎片在该页的引用计数器中被单独计算。 + +page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 +的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 +sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 + +为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, +并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, +这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 +个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 + +网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 +netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache +被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 +它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 +将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 + +许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 
+缓存的。为了实现这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因, +__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。 +这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用 +get_page。 + +Alexander Duyck,2016年11月29日。 diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst new file mode 100644 index 000000000000..03d9e613094a --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -0,0 +1,116 @@ +:Original: Documentation/mm/page_owner.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +================================ +page owner: 跟踪谁分配的每个页面 +================================ + +概述 +==== + +page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。 +当分配发生时,有关分配的信息,如调用堆栈和页面的顺序被存储到每个页面的特定存储中。 +当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。 + +尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析谁分配的每个页面是 +相当复杂的。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前出现重叠。而且,启 +动的程序会不断地将跟踪缓冲区转出,供以后分析,这将会改变系统的行为,会产生更多的 +可能性,而不是仅仅保留在内存中,所以不利于调试。 + +页面所有者也可以用于各种目的。例如,可以通过每个页面的gfp标志信息获得精确的碎片 +统计。如果启用了page owner,它就已经实现并激活了。我们非常欢迎其他用途。 + +page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline +中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有启用启动 +选项而在运行时禁用page owner,那么运行时的开销是很小的。如果在运行时禁用,它不 +需要内存来存储所有者信息,所以没有运行时内存开销。而且,页面所有者在页面分配器的 +热路径中只插入了两个不可能的分支,如果不启用,那么分配就会像没有页面所有者的内核 +一样进行。这两个不可能的分支应该不会影响到分配的性能,特别是在静态键跳转标签修补 +功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。 + +- 没有page owner:: + + text data bss dec hex filename + 48392 2333 644 51369 c8a9 mm/page_alloc.o + +- 有page owner:: + + text data bss dec hex filename + 48800 2445 644 51889 cab1 mm/page_alloc.o + 6662 108 29 6799 1a8f mm/page_owner.o + 1025 8 8 1041 411 mm/page_ext.o + +虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath +中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。 + +有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这 +个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始化 +之前,许多页面可以被分配,但它们没有所有者信息。为了解决这个问题,这些早期分配的 +页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息,但至 +少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有13343 +个早期分配的页面被捕捉和标记,尽管它们大部分是由结构页扩展功能分配的。总之,在这 +之后,没有任何页面处于未追踪状态。 + +使用方法 +======== + +1) 构建用户空间的帮助:: + + cd 
tools/vm + make page_owner_sort + +2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. + +3) 做你想调试的工作。 + +4) 分析来自页面所有者的信息:: + + cat /sys/kernel/debug/page_owner > page_owner_full.txt + ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + + ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 + 取页序值,计算buf的次数和页数,最后根据参数进行排序。 + + 在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出:: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + 默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想 + 按buf的页数排序,请使用-m参数。详细的参数是: + + 基本函数: + + Sort: + -a 按内存分配时间排序 + -m 按总内存排序 + -p 按pid排序。 + -P 按tgid排序。 + -r 按内存释放时间排序。 + -s 按堆栈跟踪排序。 + -t 按时间排序(默认)。 + + 其它函数: + + Cull: + -c 通过比较堆栈跟踪而不是总块来进行剔除。 + + Filter: + -f 过滤掉内存已被释放的块的信息。 diff --git a/Documentation/translations/zh_CN/mm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst new file mode 100644 index 000000000000..e8077310a76c --- /dev/null +++ b/Documentation/translations/zh_CN/mm/page_table_check.rst @@ -0,0 +1,56 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +:Original: Documentation/mm/page_table_check.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +======== +页表检查 +======== + +概述 +==== + +页表检查允许通过确保防止某些类型的内存损坏来强化内核。 + +当新的页面可以从用户空间访问时,页表检查通过将它们的页表项(PTEs PMD等)添加到页表中来执行额外 +的验证。 + +在检测到损坏的情况下,内核会被崩溃。页表检查有一个小的性能和内存开销。因此,它在默认情况下是禁用 +的,但是在额外的加固超过性能成本的系统上,可以选择启用。另外,由于页表检查是同步的,它可以帮助调 +试双映射内存损坏问题,在错误的映射发生时崩溃内核,而不是在内存损坏错误发生后内核崩溃。 + +双重映射检测逻辑 +================ + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +启用页表检查 +============ + +用以下方法构建内核: + +- PAGE_TABLE_CHECK=y + 注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。 + +- 使用 "page_table_check=on" 内核参数启动。 + +可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在没有额外的内核参数的情况下获得页表 +支持。 diff --git a/Documentation/translations/zh_CN/mm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst new file mode 100644 index 000000000000..31e0c54dc36f --- /dev/null +++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst @@ -0,0 +1,32 @@ +:Original: Documentation/mm/remap_file_pages.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +============================== +remap_file_pages()系统调用 +============================== + +remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中, 
+文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好 +处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。 + +支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码,包括热 +路径。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项和带有文件 +偏移的项(pte_file)。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资 +源,特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。 + +幸运的是,在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS +实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。 +由于64位系统的广泛使用,这种使用情况已经不重要了。 + +syscall被废弃了,现在用一个模拟来代替它。仿真会创建新的VMA,而不是非线性映射。 +对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了。 + +仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到 +vm.max_map_count的限制。关于限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT +的注释。 diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst new file mode 100644 index 000000000000..4fb7aa666037 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst @@ -0,0 +1,96 @@ +:Original: Documentation/mm/split_page_table_lock.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +================================= +分页表锁(split page table lock) +================================= + +最初,mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方 +法导致了多线程应用程序的缺页异常可扩展性差,因为对锁的争夺很激烈。为了提高可扩 +展性,我们引入了分页表锁。 + +有了分页表锁,我们就有了单独的每张表锁来顺序化对表的访问。目前,我们对PTE和 +PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。 + +有一些辅助工具来锁定/解锁一个表和其他访问器函数: + + - pte_offset_map_lock() + 映射pte并获取PTE表锁,返回所取锁的指针; + - pte_unmap_unlock() + 解锁和解映射PTE表; + - pte_alloc_map_lock() + 如果需要的话,分配PTE表并获取锁,如果分配失败,返回已获取的锁的指针 + 或NULL; + - pte_lockptr() + 返回指向PTE表锁的指针; + - pmd_lock() + 取得PMD表锁,返回所取锁的指针。 + - pmd_lockptr() + 返回指向PMD表锁的指针; + +如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译 +时启用PTE表的分页表锁。如果分页锁被禁用,所有的表都由mm->page_table_lock +来保护。 + +如果PMD表启用了分页锁,并且架构支持它,那么PMD表的分页锁就会被启用(见 +下文)。 + +Hugetlb 和分页表锁 +================== + +Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁,但不对PUD使用。 + +Hugetlb特定的辅助函数: + + - huge_pte_lock() + 对PMD_SIZE页面采取pmd分割锁,否则mm->page_table_lock; + - huge_pte_lockptr() + 返回指向表锁的指针。 + +架构对分页表锁的支持 +==================== + 
+没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() +和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 + +确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 +面。这个区域与page->ptl共享存储。 + +PMD分页锁只有在你有两个以上的页表级别时才有意义。 + +启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 +用pgtable_pmd_page_dtor()。 + +分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() +中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 +分配一些PMD。 + +一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 + +注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 +须正确处理。 + +page->ptl +========= + +page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struct page。它 +与page->private(以及union中的其他几个字段)共享存储。 + +为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧: + + - 如果spinlock_t适合于long,我们使用page->ptl作为spinlock,这样我们 + 就可以避免间接访问并节省一个缓存行。 + - 如果spinlock_t的大小大于long的大小,我们使用page->ptl作为spinlock_t + 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 + 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 + +PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t +分配在pgtable_pmd_page_ctor()中。 + +请不要直接访问page->ptl--使用适当的辅助函数。 diff --git a/Documentation/translations/zh_CN/mm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst new file mode 100644 index 000000000000..9569a6d88270 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/z3fold.rst @@ -0,0 +1,31 @@ +:Original: Documentation/mm/z3fold.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + + +====== +z3fold +====== + +z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。 +它是zbud的衍生物,允许更高的压缩率,保持其前辈的简单性和确定性。 + +z3fold和zbud的主要区别是: + +* 与zbud不同的是,z3fold允许最大的PAGE_SIZE分配。 +* z3fold在其页面中最多可以容纳3个压缩页面 +* z3fold本身没有输出任何API,因此打算通过zpool的API来使用 + +为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储一个整数的压缩页,但是 +它最多可以存储3页,不像zbud最多可以存储2页。因此压缩率达到2.7倍左右,而zbud的压缩 +率是1.7倍左右。 + +与zbud不同(但与zsmalloc相同),z3fold_alloc()并不返回一个可解引用的指针。相反,它 +返回一个无符号长句柄,它编码了被分配对象的实际位置。 + +保持有效的压缩率接近于zsmalloc,z3fold不依赖于MMU的启用,并提供更可预测的回收行 +为,这使得它更适合于小型和反应迅速的系统。 diff --git a/Documentation/translations/zh_CN/mm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst new file mode 100644 index
000000000000..b5596ea08ae4 --- /dev/null +++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst @@ -0,0 +1,78 @@ +:Original: Documentation/mm/zs_malloc.rst + +:翻译: + + 司延腾 Yanteng Si + +:校译: + +======== +zsmalloc +======== + +这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是, +它从未尝试过higher order页面的分配,这在内存压力下很可能会失败。另一方面,如果我们只 +是使用单(0-order)页,它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将 +占据整个页面。这是其前身(xvmalloc)的主要问题之一。 + +为了克服这些问题,zsmalloc分配了一堆0-order页面,并使用各种"struct page"字段将它 +们链接起来。这些链接的页面作为一个单一的higher order页面,即一个对象可以跨越0-order +页面的边界。代码将这些链接的页面作为一个实体,称为zspage。 + +为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的 +要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这 +个大小的分配请求,会返回失败(见zs_malloc)。 + +此外,zs_malloc()并不返回一个可重复引用的指针。相反,它返回一个不透明的句柄(无符号 +长),它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久 +映射,因为这在32位系统上会导致问题,因为内核空间映射的VA区域非常小。因此,在使用分配 +的内存之前,对象必须使用zs_map_object()进行映射以获得一个可用的指针,随后使用 +zs_unmap_object()解除映射。 + +stat +==== + +通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/`` +看到zsmalloc内部信息。下面是一个统计输出的例子。:: + + # cat /sys/kernel/debug/zsmalloc/zram0/classes + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage + ... + ... + 9 176 0 1 186 129 8 4 + 10 192 1 0 2880 2872 135 3 + 11 208 0 1 819 795 42 2 + 12 224 0 1 219 159 12 4 + ... + ... 
+ + +class + 索引 +size + zspage存储对象大小 +almost_empty + ZS_ALMOST_EMPTY zspage的数量(见下文)。 +almost_full + ZS_ALMOST_FULL zspage的数量(见下图) +obj_allocated + 已分配对象的数量 +obj_used + 分配给用户的对象的数量 +pages_used + 为该类分配的页数 +pages_per_zspage + 组成一个zspage的0-order页面的数量 + +当n <= N / f时,我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组,其中 + +* n = 已分配对象的数量 +* N = zspage可以存储的对象总数 +* f = fullness_threshold_frac(即,目前是4个) + +同样地,我们将zspage分配给: + +* ZS_ALMOST_FULL when n > N / f +* ZS_EMPTY when n == 0 +* ZS_FULL when n == N diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/vm/active_mm.rst deleted file mode 100644 index 366609ea4f37..000000000000 --- a/Documentation/translations/zh_CN/vm/active_mm.rst +++ /dev/null @@ -1,85 +0,0 @@ -.. include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/active_mm.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -========= -Active MM -========= - -这是一封linux之父回复开发者的一封邮件,所以翻译时我尽量保持邮件格式的完整。 - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - 因为我并不经常写解释,所以已经抄送到linux-kernel邮件列表,而当我做这些, - 且更多的人在阅读它们时,我觉得棒极了。 - - 1999年7月30日 星期五, David Mosberger 写道: - > - > 是否有一个简短的描述,说明task_struct中的 - > "mm" 和 "active_mm"应该如何使用? 
(如果 - > 这个问题在邮件列表中讨论过,我表示歉意--我刚 - > 刚度假回来,有一段时间没能关注linux-kernel了)。 - - 基本上,新的设定是: - - - 我们有“真实地址空间”和“匿名地址空间”。区别在于,匿名地址空间根本不关心用 - 户级页表,所以当我们做上下文切换到匿名地址空间时,我们只是让以前的地址空间 - 处于活动状态。 - - 一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线 - 程基本上都属于这一类,但即使是“真正的”线程也可以暂时说在一定时间内它们不 - 会对用户空间感兴趣,调度器不妨试着避免在切换VM状态上浪费时间。目前只有老 - 式的bdflush sync能做到这一点。 - - - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说,tsk->mm将是NULL, - 其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。 - - - 然而,我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此,我们 - 有 “tsk->active_mm”,它显示了当前活动的地址空间是什么。 - - 规则是,对于一个有真实地址空间的进程(即tsk->mm是 non-NULL),active_mm - 显然必须与真实的mm相同。 - - 对于一个匿名进程,tsk->mm == NULL,而tsk->active_mm是匿名进程运行时 - “借用”的mm。当匿名进程被调度走时,借用的地址空间被返回并清除。 - - 为了支持所有这些,“struct mm_struct”现在有两个计数器:一个是 “mm_users” - 计数器,即有多少 “真正的地址空间用户”,另一个是 “mm_count”计数器,即 “lazy” - 用户(即匿名用户)的数量,如果有任何真正的用户,则加1。 - - 通常情况下,至少有一个真正的用户,但也可能是真正的用户在另一个CPU上退出,而 - 一个lazy的用户仍在活动,所以你实际上得到的情况是,你有一个地址空间 **只** - 被lazy的用户使用。这通常是一个短暂的生命周期状态,因为一旦这个线程被安排给一 - 个真正的线程,这个 “僵尸” mm就会被释放,因为 “mm_count”变成了零。 - - 另外,一个新的规则是,**没有人** 再把 “init_mm” 作为一个真正的MM了。 - “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”,事实上,它主 - 要是在启动时使用,当时还没有真正的VM被创建。因此,用来检查的代码 - - if (current->mm == &init_mm) - - 一般来说,应该用 - - if (!current->mm) - - 取代上面的写法(这更有意义--测试基本上是 “我们是否有一个用户环境”,并且通常 - 由缺页异常处理程序和类似的东西来完成)。 - - 总之,我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1,因为它稍微改 - 变了接口以适配alpha(谁会想到呢,但alpha体系结构上下文切换代码实际上最终是 - 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的,alpha的PALcode将两者 - 连接起来,你需要同时切换两者)。 - - (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/vm/balance.rst deleted file mode 100644 index e98a47ef24a8..000000000000 --- a/Documentation/translations/zh_CN/vm/balance.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/balance.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -内存平衡 -======== - -2000年1月开始,作者:Kanoj Sarcar - -对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配,需要进行 -内存平衡。 - -调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个 -原因可能是,调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退 -选项的机会主义高阶分配请求中。在这种情况下,调用者可能也希望避免唤醒kswapd。 - -__GFP_IO分配请求是为了防止文件系统死锁。 - -在没有非睡眠分配请求的情况下,做平衡似乎是有害的。页面回收可以被懒散地启动,也就是 -说,只有在需要的时候(也就是区域的空闲内存为0),而不是让它成为一个主动的过程。 - -也就是说,内核应该尝试从直接映射池中满足对直接映射页的请求,而不是回退到dma池中, -这样就可以保持dma池为dma请求(不管是不是原子的)所填充。类似的争论也适用于高内存 -和直接映射的页面。相反,如果有很多空闲的dma页,最好是通过从dma池中分配一个来满足 -常规的内存请求,而不是产生常规区域平衡的开销。 - -在2.2中,只有当空闲页总数低于总内存的1/64时,才会启动内存平衡/页面回收。如果dma -和常规内存的比例合适,即使dma区完全空了,也很可能不会进行平衡。2.2已经在不同内存 -大小的生产机器上运行,即使有这个问题存在,似乎也做得不错。在2.3中,由于HIGHMEM的 -存在,这个问题变得更加严重。 - -在2.3中,区域平衡可以用两种方式之一来完成:根据区域的大小(可能是低级区域的大小), -我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是,在平衡的时 -候,我们不需要看低级区的大小,坏的方面是,我们可能会因为忽略低级区可能较低的使用率 -而做过于频繁的平衡。另外,只要对分配程序稍作修改,就有可能将memclass()宏简化为一 -个简单的等式。 - -另一个可能的解决方案是,我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其 -低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题,并尽可能地保持了与2.2行为 -的接近。另外,平衡算法在各种架构上的工作方式也是一样的,这些架构有不同数量和类型的 -内存区。如果我们想变得更花哨一点,我们可以在未来为不同区域的自由页面分配不同的权重。 - -请注意,如果普通区的大小与dma区相比是巨大的,那么在决定是否平衡普通区的时候,考虑 -空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。 - -所附的补丁实现了第二个解决方案。它还 “修复”了两个问题:首先,在低内存条件下,kswapd -被唤醒,就像2.2中的非睡眠分配。第二,HIGHMEM区也被平衡了,以便给replace_with_highmem() -一个争取获得HIGHMEM页的机会,同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM -页不会被泄露(例如,在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下)。 - -kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的,可能 -是因为所有的分配请求都来自中断上下文,而所有的进程上下文都在睡眠。对于2.3, -kswapd并不真正需要平衡高内存区,因为中断上下文并不请求高内存页。kswapd看zone -结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。 - -如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力,而该区的内存压力 -已经低于其水位,则会进行偷取。 - -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: -这些是每个区的字段,用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时, -hysteric 的字段low_on_memory被设置。这个字段会一直被设置,直到空闲页数变成水位 -[WMARK_HIGH]。当low_on_memory被设置时,页面分配请求将尝试释放该区域的一些页面(如果 -请求中设置了GFP_WAIT)。与此相反的是,决定唤醒kswapd以释放一些区的页。这个决定不是基于 
-hysteresis 的,而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行;在这种情况下, -zone_wake_kswapd也被设置。 - - -我所听到的(超棒的)想法: - -1. 动态经历应该影响平衡:可以跟踪一个区的失败请求的数量,并反馈到平衡方案中(jalvo@mbay.net)。 - -2. 实现一个类似于replace_with_highmem()的replace_with_regular(),以保留dma页面。 - (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/vm/damon/api.rst deleted file mode 100644 index 21143eea4ebe..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/api.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/api.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======= -API参考 -======= - -内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` , -它位于源代码树的include/linux/。 - -结构体 -====== - -该API在以下内核代码中: - -include/linux/damon.h - - -函数 -==== - -该API在以下内核代码中: - -mm/damon/core.c diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst deleted file mode 100644 index 46128b77c2b3..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/design.rst +++ /dev/null @@ -1,140 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/design.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -==== -设计 -==== - -可配置的层 -========== - -DAMON提供了数据访问监控功能,同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间 -并为之优化的基元。另一方面,作为DAMON的核心,准确性和开销的权衡机制是在纯逻辑空间中。DAMON -将这两部分分离在不同的层中,并定义了它的接口,以允许各种低层次的基元实现与核心逻辑的配置。 - -由于这种分离的设计和可配置的接口,用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的 -任何地址空间。如果没有提供合适的,用户可以自己实现基元。 - -例如,物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。 -另外,如果某些架构或设备支持特殊的优化访问检查基元,这些基元将很容易被配置。 - - -特定地址空间基元的参考实现 -========================== - -基本访问监测的低级基元被定义为两部分。: - -1. 确定地址空间的监测目标地址范围 -2. 
目标空间中特定地址范围的访问检查。 - -DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。 - - -基于VMA的目标地址范围构造 -------------------------- - -这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间,只是要求用户手动设置监控目标地址范围。 - -在进程的超级巨大的虚拟地址空间中,只有小部分被映射到物理内存并被访问。因此,跟踪未映射的地 -址区域只是一种浪费。然而,由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声,所以严 -格来说,跟踪每一个映射并不是必须的,但在某些情况下甚至会产生很高的开销。也就是说,监测目标 -内部过于巨大的未映射区域应该被移除,以不占用自适应机制的时间。 - -出于这个原因,这个实现将复杂的映射转换为三个不同的区域,覆盖地址空间的每个映射区域。这三个 -区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面 -的mmap()区域之间的间隙,以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙 -在通常的地址空间中是异常巨大的,排除这些间隙就足以做出合理的权衡。下面详细说明了这一点:: - - - - - (small mmap()-ed regions and munmap()-ed regions) - - - - - -基于PTE访问位的访问检查 ------------------------ - -物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中 -找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表,而物理地址的实现则 -是查找与该地址有映射关系的每一个页表。通过这种方式,实现者找到并清除下一个采样目标地址的位, -并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统,即空闲页跟 -踪和回收逻辑。为了避免这种干扰,DAMON使其与空闲页面跟踪相互排斥,并使用 ``PG_idle`` 和 -``PG_young`` 页面标志来解决与回收逻辑的冲突,就像空闲页面跟踪那样。 - - -独立于地址空间的核心机制 -======================== - -下面四个部分分别描述了DAMON的核心机制和五个监测属性,即 ``采样间隔`` 、 ``聚集间隔`` 、 -``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。 - - -访问频率监测 ------------- - -DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置 -``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说,DAMON检查每个 ``采样间隔`` 对每 -个页面的访问,并将结果汇总。换句话说,计算每个页面的访问次数。在每个 ``聚合间隔`` 过 -去后,DAMON调用先前由用户注册的回调函数,以便用户可以阅读聚合的结果,然后再清除这些结 -果。这可以用以下简单的伪代码来描述:: - - while monitoring_on: - for page in monitoring_target: - if accessed(page): - nr_accesses[page] += 1 - if time() % aggregation_interval == 0: - for callback in user_registered_callbacks: - callback(monitoring_target, nr_accesses) - for page in monitoring_target: - nr_accesses[page] = 0 - sleep(sampling interval) - -这种机制的监测开销将随着目标工作负载规模的增长而任意增加。 - - -基于区域的抽样调查 ------------------- - -为了避免开销的无限制增加,DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持 -这个假设(一个区域内的页面具有相同的访问频率),该区域内就只需要检查一个页面。因此,对 -于每个 ``采样间隔`` ,DAMON在每个区域中随机挑选一个页面,等待一个 ``采样间隔`` ,检 -查该页面是否同时被访问,如果被访问则增加该区域的访问频率。因此,监测开销是可以通过设置 -区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。 - -然而,如果假设没有得到保证,这个方案就不能保持输出的质量。 - - -适应性区域调整 --------------- - 
-即使最初的监测目标区域被很好地构建以满足假设(同一区域内的页面具有相似的访问频率),数 -据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设,DAMON根据每个 -区域的访问频率自适应地进行合并和拆分。 - -对于每个 ``聚集区间`` ,它比较相邻区域的访问频率,如果频率差异较小,就合并这些区域。 -然后,在它报告并清除每个区域的聚合接入频率后,如果区域总数不超过用户指定的最大区域数, -它将每个区域拆分为两个或三个区域。 - -通过这种方式,DAMON提供了其最佳的质量和最小的开销,同时保持了用户为其权衡设定的界限。 - - -动态目标空间更新处理 --------------------- - -监测目标地址范围可以动态改变。例如,虚拟内存可以动态地被映射和解映射。物理内存可以被 -热插拔。 - -由于在某些情况下变化可能相当频繁,DAMON允许监控操作检查动态变化,包括内存映射变化, -并仅在用户指定的时间间隔( ``更新间隔`` )中的每个时间段,将其应用于监控操作相关的 -数据结构,如抽象的监控目标内存区。 \ No newline at end of file diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/vm/damon/faq.rst deleted file mode 100644 index 07b4ac19407d..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/faq.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/faq.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -常见问题 -======== - -为什么是一个新的子系统,而不是扩展perf或其他用户空间工具? -========================================================== - -首先,因为它需要尽可能的轻量级,以便可以在线使用,所以应该避免任何不必要的开销,如内核-用户 -空间的上下文切换成本。第二,DAMON的目标是被包括内核在内的其他程序所使用。因此,对特定工具 -(如perf)的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。 - - -“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗? -============================================== - -闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的,尽管它可以 -使用采样来减少开销。另一方面,DAMON是一个更高层次的框架,用于监控各种地址空间。它专注于内 -存管理优化,并提供复杂的精度/开销处理机制。因此,“空闲页面跟踪” 和 “perf mem” 可以提供 -DAMON输出的一个子集,但不能替代DAMON。 - - -DAMON是否只支持虚拟内存? -========================= - -不,DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始 -部分,包括监测目标区域的构造和实际的访问检查。通过这种方式,DAMON用户可以用任何访问检查技 -术来监测任何地址空间。 - -尽管如此,DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间 -相关功能的实现,以供参考和方便使用。 - - -我可以简单地监测页面的粒度吗? 
-============================== - -是的,你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。 -因为监视目标区域的大小被强制为 ``>=page size`` ,所以区域分割不会产生任何影响。 diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/vm/damon/index.rst deleted file mode 100644 index 84d36d90c9b0..000000000000 --- a/Documentation/translations/zh_CN/vm/damon/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/damon/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -========================== -DAMON:数据访问监视器 -========================== - -DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 -(该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)) - - - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), - - *轻量级* (监控开销低到可以在线应用),以及 - - *可扩展* (无论目标工作负载的大小,开销的上限值都在恒定范围内)。 - -因此,利用这个框架,内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实 -验性内存管理优化工作可以再次进行。同时,在用户空间,有一些特殊工作负载的用户可以编写 -个性化的应用程序,以便更好地了解和优化他们的工作负载和系统。 - -.. toctree:: - :maxdepth: 2 - - faq - design - api - diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/vm/free_page_reporting.rst deleted file mode 100644 index 31d6c34b956b..000000000000 --- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -空闲页报告 -========== - -空闲页报告是一个API,设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟 -化的情况下是很有用的,客户机能够使用这些数据来通知管理器它不再使用内存中的某些页 -面。 - -对于驱动,通常是气球驱动要使用这个功能,它将分配和初始化一个page_reporting_dev_info -结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必 -须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。 -假设没有其他页面报告设备已经注册, 对page_reporting_register的调用将向报告框 -架注册页面报告接口。 - -一旦注册,页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告 -页面,并在任何足够高的页面被释放之后2秒继续报告。 - -报告的页面将被存储在传递给报告函数的散列表中,最后一个条目的结束位被设置在条目 -nent-1中。 当页面被报告函数处理时,分配器将无法访问它们。一旦报告函数完成,这些 -页将被返回到它们所获得的自由区域。 - -在移除使用空闲页报告的驱动之前,有必要调用page_reporting_unregister,以移除 -目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报 -告通过该接口发出。如果另一个驱动或同一驱动被注册,它就有可能恢复前一个驱动在报告 -空闲页方面的工作。 - - -Alexander Duyck, 2019年12月04日 diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst deleted file mode 100644 index 3eb07870e2ef..000000000000 --- a/Documentation/translations/zh_CN/vm/frontswap.rst +++ /dev/null @@ -1,196 +0,0 @@ -:Original: Documentation/vm/_free_page_reporting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========= -Frontswap -========= - -Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 -于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 -节省(提高)。 - -.. 
_Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 -储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory -(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 -求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 -调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops -的功能,它提供的功能必须符合某些策略,如下所示: - -一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap -交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 -量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 -内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 -transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 -相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 - -一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 -要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 -经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, -也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 -可以像往常一样被写入交换空间。 - -请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” -的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 -不会从frontswap中获得。 - -如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 -debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): - -``failed_stores`` - 有多少次存储的尝试是失败的 - -``loads`` - 尝试了多少次加载(应该全部成功) - -``succ_stores`` - 有多少次存储的尝试是成功的 - -``invalidates`` - 尝试了多少次作废 - -后台实现可以提供额外的指标。 - -经常问到的问题 -============== - -* 价值在哪里? 
- -当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 -读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 -能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 -移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 -页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 - -Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 -灵活性: - -在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 -全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 -用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 -工作负载上,则有明显的性能改善(25%以上)。 - -“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory -的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 -统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 -交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 -可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 -有多少内存可用 - -在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 -用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 -是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory -后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, -而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 -的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap -允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), -从而减轻计划外交换可能带来的可怕的性能影响。 - -一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 -内存扩展技术的调查也在进行中。 - -* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? - -如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 -个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 -frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 -是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 -后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 -忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 -的,无论如何使用一小部分的 CPU 都是不相关的。 - -至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 -每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 -页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 -会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 -非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 - -当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 -产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 -来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 - -* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? 
- -我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 -明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 -供了多少内存是完全动态和随机的。 - -每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 -型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 -换页的尝试。 - -每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 -frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 -间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap -backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 -页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 -据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 -下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 -页面偏移量,否则它就会将数据写入该设备。 - -当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), -检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 -的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, -以便从真正的交换设备上获得这一页的数据。 - -所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 -入都被 “frontswap backend store” 和(可能)“frontswap backend loads” -所取代,这可能会快得多。 - -* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 - 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? 
- -首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 -结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 -假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 -及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 - -例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend -的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 -先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 -以根据当前的内存限制动态地定义。 - -此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 -块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 -不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 -态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 -复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 -的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” -线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, -KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 - -在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 -例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 -自己的交换。 - -transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 -能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, -frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 -容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 -swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap -就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 -backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 - - -* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, - 难道它不能总是被成功地覆盖吗? - -几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 -缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 -backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap -拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 -换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 - -* 为什么frontswap补丁会创建新的头文件swapfile.h? - -frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 -在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 -个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 - -Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/vm/highmem.rst deleted file mode 100644 index 018838e58c3e..000000000000 --- a/Documentation/translations/zh_CN/vm/highmem.rst +++ /dev/null @@ -1,128 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/highmem.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========== -高内存处理 -========== - -作者: Peter Zijlstra - -.. contents:: :local: - -高内存是什么? -============== - -当物理内存的大小接近或超过虚拟内存的最大大小时,就会使用高内存(highmem)。在这一点上,内 -核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内 -存的临时映射。 - -没有被永久映射覆盖的那部分(物理)内存就是我们所说的 "高内存"。对于这个边界的确切位置,有 -各种架构上的限制。 - -例如,在i386架构中,我们选择将内核映射到每个进程的虚拟空间,这样我们就不必为内核的进入/退 -出付出全部的TLB作废代价。这意味着可用的虚拟内存空间(i386上为4GiB)必须在用户和内核空间之 -间进行划分。 - -使用这种方法的架构的传统分配方式是3:1,3GiB用于用户空间,顶部的1GiB用于内核空间。:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -这意味着内核在任何时候最多可以映射1GiB的物理内存,但是由于我们需要虚拟地址空间来做其他事 -情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少(通常在~896MiB左右)。 - -其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而,一些硬件(如一些ARM)在使 -用mm上下文标签时,其虚拟空间有限。 - - -临时虚拟映射 -============ - -内核包含几种创建临时映射的方法。: - -* vmap(). 这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization - 来解除映射。 - -* kmap(). 这允许对单个页面进行短期映射。它需要synchronization,但在一定程度上被摊销。 - 当以嵌套方式使用时,它也很容易出现死锁,因此不建议在新代码中使用它。 - -* kmap_atomic(). 
这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上, - 它表现得很好,但发布任务因此被要求留在该CPU上直到它完成,以免其他任务取代它的映射。 - - kmap_atomic() 也可以由中断上下文使用,因为它不睡眠,而且调用者可能在调用kunmap_atomic() - 之后才睡眠。 - - 可以假设k[un]map_atomic()不会失败。 - - -使用kmap_atomic -=============== - -何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存(见__GFP_HIGHMEM) -分配的页面的内容时,例如在页缓存中的页面,就会使用它。该API有两个函数,它们的使用方式与 -下面类似:: - - /* 找到感兴趣的页面。 */ - struct page *page = find_get_page(mapping, offset); - - /* 获得对该页内容的访问权。 */ - void *vaddr = kmap_atomic(page); - - /* 对该页的内容做一些处理。 */ - memset(vaddr, 0, PAGE_SIZE); - - /* 解除该页面的映射。 */ - kunmap_atomic(vaddr); - -注意,kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。 - -如果你需要映射两个页面,因为你想从一个页面复制到另一个页面,你需要保持kmap_atomic调用严 -格嵌套,如:: - - vaddr1 = kmap_atomic(page1); - vaddr2 = kmap_atomic(page2); - - memcpy(vaddr1, vaddr2, PAGE_SIZE); - - kunmap_atomic(vaddr2); - kunmap_atomic(vaddr1); - - -临时映射的成本 -============== - -创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。 - -如果CONFIG_HIGHMEM没有被设置,那么内核会尝试用一点计算来创建映射,将页面结构地址转换成 -指向页面内容的指针,而不是去捣鼓映射。在这种情况下,解映射操作可能是一个空操作。 - -如果CONFIG_MMU没有被设置,那么就不可能有临时映射和高内存。在这种情况下,也将使用计算方法。 - - -i386 PAE -======== - -在某些情况下,i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果: - -* Linux需要为系统中的每个页面建立一个页帧结构,而且页帧需要驻在永久映射中,这意味着: - -* 你最多可以有896M/sizeof(struct page)页帧;由于页结构体是32字节的,所以最终会有 - 112G的页;然而,内核需要在内存中存储更多的页帧...... - -* PAE使你的页表变大--这使系统变慢,因为更多的数据需要在TLB填充等方面被访问。一个好处 - 是,PAE有更多的PTE位,可以提供像NX和PAT这样的高级功能。 - -一般的建议是,你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作 -量有用,但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。 diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst deleted file mode 100644 index 2379df95aa58..000000000000 --- a/Documentation/translations/zh_CN/vm/hmm.rst +++ /dev/null @@ -1,361 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hmm.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================== -异构内存管理 (HMM) -================== - -提供基础设施和帮助程序以将非常规内存(设备内存,如板上 GPU 内存)集成到常规内核路径中,其 -基石是此类内存的专用struct page(请参阅本文档的第 5 至 7 节)。 - -HMM 还为 SVM(共享虚拟内存)提供了可选的帮助程序,即允许设备透明地访问与 CPU 一致的程序 -地址,这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得 -必不可少,其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。 - -本文档分为以下部分:在第一部分中,我揭示了与使用特定于设备的内存分配器相关的问题。在第二 -部分中,我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页 -表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后, -最后一节介绍了一个新的迁移助手,它允许利用设备 DMA 引擎。 - -.. contents:: :local: - -使用特定于设备的内存分配器的问题 -================================ - -具有大量板载内存(几 GB)的设备(如 GPU)历来通过专用驱动程序特定 API 管理其内存。这会 -造成设备驱动程序分配和管理的内存与常规应用程序内存(私有匿名、共享内存或常规文件支持内存) -之间的隔断。从这里开始,我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况: -即,设备可以透明地使用任何应用程序内存区域。 - -分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来 -看,程序中的所有内存对象并不平等,这使得依赖于广泛的库的大型程序变得复杂。 - -具体来说,这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存(malloc、mmap -私有、mmap 共享)和通过设备驱动程序 API 分配的内存之间复制对象(这仍然以 mmap 结束, -但是是设备文件)。 - -对于平面数据集(数组、网格、图像……),这并不难实现,但对于复杂数据集(列表、树……), -很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错, -而且由于数据集和地址的重复,程序更难调试。 - -分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据,因此每个库可能 -不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响,并因为各种内存 -拷贝而浪费资源。 - -复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出,并不是一个可行的选择。 -这将导致库入口点的组合爆炸。 - -最后,随着高级语言结构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有可能在没有程 -序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有 -其他模式,使用共享地址空间也更合理。 - - -I/O 总线、设备内存特性 -====================== - -由于一些限制,I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本 -内存访问;甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下,它 -不是缓存一致的。 - -如果我们只考虑 PCIE 总线,那么设备可以访问主内存(通常通过 IOMMU)并与 CPU 缓存一 -致。但是,它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟:CPU -只能访问有限范围的设备内存,而不能对其执行原子操作。因此,从内核的角度来看,设备内存不 -能被视为与常规内存等同。 - -另一个严重的因素是带宽有限(约 32GBytes/s,PCIE 4.0 和 16 通道)。这比最快的 GPU -内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自 -己的内存时高一个数量级。 - -一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制 -(OpenCAPI、CCIX)。它们主要允许 CPU 和设备之间的双向缓存一致性,并允许架构支持的所 
-有原子操作。遗憾的是,并非所有平台都遵循这一趋势,并且一些主要架构没有针对这些问题的硬 -件解决方案。 - -因此,为了使共享地址空间有意义,我们不仅必须允许设备访问任何内存,而且还必须允许任何内 -存在设备使用时迁移到设备内存(在迁移时阻止 CPU 访问)。 - - -共享地址空间和迁移 -================== - -HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间,因此对 -于进程地址空间中的任何有效主内存地址,相同的地址指向相同的物理内存。 - -为了实现这一点,HMM 提供了一组帮助程序来填充设备页表,同时跟踪 CPU 页表更新。设备页表 -更新不像 CPU 页表更新那么容易。要更新设备页表,您必须分配一个缓冲区(或使用预先分配的 -缓冲区池)并在其中写入 GPU 特定命令以执行更新(取消映射、缓存失效和刷新等)。这不能通 -过所有设备的通用代码来完成。因此,为什么HMM提供了帮助器,在把硬件的具体细节留给设备驱 -动程序的同时,把一切可以考虑的因素都考虑进去了。 - -HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存,它允许为设备内存的每个页面分配一个 -struct page。这些页面很特殊,因为 CPU 无法映射它们。然而,它们允许使用现有的迁移机 -制将主内存迁移到设备内存,从 CPU 的角度来看,一切看起来都像是换出到磁盘的页面。使用 -struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次,HMM 仅提供帮助程序, -首先为设备内存热插拔新的 ZONE_DEVICE 内存,然后执行迁移。迁移内容和时间的策略决定留 -给设备驱动程序。 - -请注意,任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如,当支持给定CPU -地址 A 的页面从主内存页面迁移到设备页面时,对地址 A 的任何 CPU 访问都会触发缺页异常 -并启动向主内存的迁移。 - -凭借这两个特性,HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步,而且还通 -过迁移设备正在使用的数据集部分来利用设备内存。 - - -地址空间镜像实现和API -===================== - -地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中;HMM 有助于 -保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier -开始:: - - int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, - struct mm_struct *mm, unsigned long start, - unsigned long length, - const struct mmu_interval_notifier_ops *ops); - -在 ops->invalidate() 回调期间,设备驱动程序必须对范围执行更新操作(将范围标记为只 -读,或完全取消映射等)。设备必须在驱动程序回调返回之前完成更新。 - -当设备驱动程序想要填充一个虚拟地址范围时,它可以使用:: - - int hmm_range_fault(struct hmm_range *range); - -如果请求写访问,它将在丢失或只读条目上触发缺页异常(见下文)。缺页异常使用通用的 mm 缺 -页异常代码路径,就像 CPU 缺页异常一样。 - -这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟 -范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。 - -在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面,以保 -持事物正确同步。使用模式是:: - - int driver_populate_range(...) - { - struct hmm_range range; - ... 
- - range.notifier = &interval_sub; - range.start = ...; - range.end = ...; - range.hmm_pfns = ...; - - if (!mmget_not_zero(interval_sub->notifier.mm)) - return -EFAULT; - - again: - range.notifier_seq = mmu_interval_read_begin(&interval_sub); - mmap_read_lock(mm); - ret = hmm_range_fault(&range); - if (ret) { - mmap_read_unlock(mm); - if (ret == -EBUSY) - goto again; - return ret; - } - mmap_read_unlock(mm); - - take_lock(driver->update); - if (mmu_interval_read_retry(&ni, range.notifier_seq) { - release_lock(driver->update); - goto again; - } - - /* Use pfns array content to update device page table, - * under the update lock */ - - release_lock(driver->update); - return 0; - } - -driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用 -mmu_interval_read_retry() 之前保持,以避免与并发 CPU 页表更新发生任何竞争。 - -利用 default_flags 和 pfn_flags_mask -==================================== - -hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围 -的故障或快照策略,而不必为 pfns 数组中的每个条目设置它们。 - -例如,如果设备驱动程序需要至少具有读取权限的范围的页面,它会设置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = 0; - -并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。 - -现在假设驱动程序想要做同样的事情,除了它想要拥有写权限的范围内的一页。现在驱动程序设 -置:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = HMM_PFN_REQ_WRITE; - range->pfns[index_of_write] = HMM_PFN_REQ_WRITE; - -有了这个,HMM 将在至少读取(即有效)的所有页面中异常,并且对于地址 -== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限,即,如果 -CPU pte 没有设置写权限,那么HMM将调用handle_mm_fault()。 - -hmm_range_fault 完成后,标志位被设置为页表的当前状态,即 HMM_PFN_VALID | 如果页 -面可写,将设置 HMM_PFN_WRITE。 - - -从核心内核的角度表示和管理设备内存 -================================== - -尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存 -的信息,HMM 将自身挂接到 mm 代码的各个位置,以处理对设备内存支持的地址的任何访问。 -事实证明,这最终复制了 struct page 的大部分字段,并且还需要更新许多内核代码路径才 -能理解这种新的内存类型。 - -大多数内核代码路径从不尝试访问页面后面的内存,而只关心struct page的内容。正因为如此, -HMM 切换到直接使用 struct page 用于设备内存,这使得大多数内核代码路径不知道差异。 -我们只需要确保没有人试图从 CPU 端映射这些页面。 - -移入和移出设备内存 -================== - -由于 CPU 无法直接访问设备内存,因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存 
-储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和 -migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。 - -在将页面迁移到设备私有内存之前,需要创建特殊的设备私有 ``struct page`` 。这些将用 -作特殊的“交换”页表条目,以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。 - -这些可以通过以下方式分配和释放:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -还有devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``. - -整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration `) , -但这些步骤分为设备驱动程序特定代码和共享公共代码: - -1. ``mmap_read_lock()`` - - 设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(), - 因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。 - -2. 
``migrate_vma_setup(struct migrate_vma *args)`` - - 设备驱动初始化了 ``struct migrate_vma`` 的字段,并将该指针传递给 - migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。 - 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存,设置 - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页 - 面。如果后者被设置, ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备 - 私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前,只有匿名的私有VMA - 范围可以被迁移到系统内存和设备私有内存。 - - migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()`` - 和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表,使 - 其他设备的MMU无效,以便在 ``args->src`` 数组中填写要迁移的PFN。 - ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` , - 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给 - migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无 - 效化回调,只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。 - - - 在遍历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效 - 的 “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清 - 除它,而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被 - ``lock_page()``锁定,与LRU隔离(如果系统内存和设备私有页不在LRU上),从进 - 程中取消映射,并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup() - 还清除了 ``args->dst`` 数组。 - -3. 设备驱动程序分配目标页面并将源页面复制到目标页面。 - - 驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已 - 设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选 - 择跳过页面迁移。 - - 然后,驱动程序分配一个设备私有 struct page 或一个系统内存页,用 ``lock_page()`` - 锁定该页,并将 ``dst`` 数组条目填入:: - - dst[i] = migrate_pfn(page_to_pfn(dpage)); - - 现在驱动程序知道这个页面正在被迁移,它可以使设备私有 MMU 映射无效并将设备私有 - 内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失 - 效,因此设备驱动程序只需使其自己的 MMU 映射失效。 - - 驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的 - ``struct page`` 面,并将源页面复制到目标设备上,如果指针为 ``NULL`` ,意 - 味着源页面没有被填充到系统内存中,则清除目标设备的私有内存。 - -4. ``migrate_vma_pages()`` - - 这一步是实际“提交”迁移的地方。 - - 如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分配的页会被插 - 入到CPU的页表中。如果一个CPU线程在同一页面上发生异常,这可能会失败。然而,页 - 表被锁定,只有一个新页会被插入。如果它失去了竞争,设备驱动将看到 - ``MIGRATE_PFN_MIGRATE`` 位被清除。 - - 如果源页被锁定、隔离等,源 ``struct page`` 信息现在被复制到目标 - ``struct page`` ,最终完成CPU端的迁移。 - -5. 
设备驱动为仍在迁移的页面更新设备MMU页表,回滚未迁移的页面。 - - 如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置,设备驱动可以 - 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` 位被设置,则设置写启用位。 - -6. ``migrate_vma_finalize()`` - - 这一步用新页的页表项替换特殊的迁移页表项,并释放对源和目的 ``struct page`` - 的引用。 - -7. ``mmap_read_unlock()`` - - 现在可以释放锁了。 - -独占访问存储器 -============== - -一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 -个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU -的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 -个内存范围不能从用户空间访问。 - -这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会 -导致一个异常,该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已 -经被MMU通知器改变,之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序 -放弃页面锁和页面引用为止,这时页面上的任何CPU异常都可以按所述进行。 - -内存 cgroup (memcg) 和 rss 统计 -=============================== - -目前,设备内存被视为 rss 计数器中的任何常规页面(如果设备页面用于匿名,则为匿名, -如果设备页面用于文件支持页面,则为文件,如果设备页面用于共享内存,则为 shmem)。 -这是为了保持现有应用程序的故意选择,这些应用程序可能在不知情的情况下开始使用设备 -内存,运行不受影响。 - -一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序, -因此不会释放太多系统内存。在决定以不同方式计算设备内存之前,我们希望收集更多关 -于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。 - -对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算,常规 -页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规 -内存不会失败,因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对 -内存资源控制的影响有了更多的了解,我们可能会在后面重新考虑这个选择。 - -请注意,设备内存永远不能由设备驱动程序或通过 GUP 固定,因此此类内存在进程退出时 -总是被释放的。或者在共享内存或文件支持内存的情况下,当删除最后一个引用时。 diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst deleted file mode 100644 index c6d471ce2131..000000000000 --- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst +++ /dev/null @@ -1,436 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/hugetlbfs_reserv.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -============== -Hugetlbfs 预留 -============== - -概述 -==== - -:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指 -示要使用巨页,这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常 -时没有巨页存在,任务就会被发送一个SIGBUS,并经常不高兴地死去。在加入巨页支 -持后不久,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果 -没有足够的巨页来覆盖映射,mmap()将失败。这首先是在mmap()时在代码中做一个 -简单的检查,以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一 -样,代码随着时间的推移而不断发展。然而,基本的想法是在mmap()时 “预留” -巨页,以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核 -中是如何进行巨页预留处理的。 - - -读者 -==== -这个描述主要是针对正在修改hugetlbfs代码的内核开发者。 - - -数据结构 -======== - -resv_huge_pages - 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页只对预留它们的任 - 务可用。因此,一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。 -Reserve Map - 预留映射由以下结构体描述:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - 系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的 - 区域。一个区域被描述为:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型,在 - reserv_map 中的一个区域可能表示该范围存在预留,或预留不存在。 -Flags for MAP_PRIVATE Reservations - 这些被存储在预留的映射指针的底部。 - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - 表示该任务是与该映射相关的预留的所有者。 - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - 表示最初映射此范围(并创建储备)的任务由于COW失败而从该任务(子任务)中取消映 - 射了一个页面。 -Page Flags - PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在 - “释放巨页” 一节中讨论。 - - -预留映射位置(私有或共享) -========================== - -一个巨页映射或段要么是私有的,要么是共享的。如果是私有的,它通常只对一个地址空间 -(任务)可用。如果是共享的,它可以被映射到多个地址空间(任务)。对于这两种类型的映射, -预留映射的位置和语义是明显不同的。位置的差异是: - -- 对于私有映射,预留映射挂在VMA结构体上。具体来说,就是vma->vm_private_data。这个保 - 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。 -- 对于共享映射,预留映射挂在inode上。具体来说,就是inode->i_mapping->private_data。 - 由于共享映射总是由hugetlbfs文件系统中的文件支持,hugetlbfs代码确保每个节点包含一个预 - 留映射。因此,预留映射在创建节点时被分配。 - - -创建预留 -======== -当创建一个巨大的有页面支持的共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB) 
-创建一个映射时,就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用:: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE -标志。如果指定了NORESERVE,那么这个函数立即返回,因为不需要预留。 - -参数'from'和'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射 -的长度。对于mmap(),offset参数可以用来指定进入底层文件的偏移量。在这种情况下,'from'和'to' -参数已经被这个偏移量所调整。 - -PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。 - -- 对于共享映射,预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时,预留映射不被 - 修改。 -- 对于私有映射,预留映射中没有条目表示相应页面存在预留。随着预留被消耗,条目被添加到预留映射中。 - 因此,预留映射也可用于确定哪些预留已被消耗。 - -对于私有映射,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外, -HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 - -预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射,这始终是一个值(to - from)。 -然而,对于共享映射来说,一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节, -请参见 :ref:`预留映射的修改 ` 一节。 - -该映射可能与一个子池(subpool)相关联。如果是这样,将查询子池以确保有足够的空间用于映射。子池 -有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 ` -一节。 - -在咨询了预留映射和子池之后,就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查 -并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而,在这 -些函数中,代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话,全局预留计数resv_huge_pages -会被调整,如下所示:: - - if (resv_needed <= (resv_huge_pages - free_huge_pages)) - resv_huge_pages += resv_needed; - -注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被预留。 - -如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以 -反映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于私有映射, -不对预留映射进行修改,因为没有条目表示存在预留。 - -如果hugetlb_reserve_pages()成功,全局预留数和与映射相关的预留映射将根据需要被修改,以确保 -在'from'-'to'范围内存在预留。 - -消耗预留/分配一个巨页 -=========================== - -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() -中进行的:: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 -预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 -外拷贝被分配。 - - -调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详 -细内容,请参见 :ref:`预留映射帮助函数 ` 一节。从 
-vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留,则为0,如果不存在预留,则为1。 -如果不存在预留,并且有一个与映射相关联的子池,则查询子池以确定它是否包含预留。如果子池包含预留, -则可将其中一个用于该分配。然而,在任何情况下,avoid_reserve参数都会优先考虑为分配使用预留。在 -确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 -的参数: - -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 -- chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, - 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 - 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 - -与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面,当该页面从空闲列表中移除时, -free_huge_pages的值被递减。如果有一个与该页相关的预留,将进行以下调整:: - - SetPagePrivate(page); /* 表示分配这个页面消耗了一个预留, - * 如果遇到错误,以至于必须释放这个页面,预留将被 - * 恢复。 */ - resv_huge_pages--; /* 减少全局预留计数 */ - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: -SetPagePrivate(page) 和 resv_huge_pages--. - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建一 -个新的条目。 - -注意,如果找不到满足VMA内存策略的巨页,将尝试使用伙伴分配器分配一个。这就带来了超出预留范围 -的剩余巨页和过度分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整。 -SetPagePrivate(page)和resv_huge_pages-。 - -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 -面被释放时,这将被用于子池的计数。 - -然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 -到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射,预留映射中的条目 -已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 -一个新的条目。 - -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 -vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 -享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 -这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 -识别。如果检测到这种竞争,子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息,请 -参见 :ref:`预留映射帮助函数 ` 一节。 - - -实例化巨页 -========== - -在巨页分配之后,页面通常被添加到分配任务的页表中。在此之前,共享映射中的页面被添加到页面缓 -存中,私有映射中的页面被添加到匿名反向映射中。在这两种情况下,PagePrivate标志被清除。因此, -当一个已经实例化的巨页被释放时,不会对全局预留计数(resv_huge_pages)进行调整。 - - -释放巨页 -======== - -巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 
-递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 -留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 - -page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置,它表明全局预留计数 -应该被调整(关于如何设置这些标志的信息,请参见 -:ref: `消耗预留/分配一个巨页 ` )。 - - -该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值(不等于 -传递的1的值),它表明预留与子池相关联,这个新释放的页面必须被用来保持子池预留的数量超过最小值。 -因此,在这种情况下,全局resv_huge_pages计数器被递增。 - -如果页面中设置了PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。 - -子池预留 -======== - -有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一 -个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关 - -当一个hugetlbfs文件系统被挂载时,可以指定min_size选项,它表示文件系统所需的最小的巨页数量。 -如果指定了这个选项,与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体 -hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages) -被调用以预留指定数量的巨页。如果它们不能被预留,挂载就会失败。 - -当从子池中获取或释放页面时,会调用hugepage_subpool_get/put_pages()函数。 -hugepage_subpool_get/put_pages被传递给巨页数量,以此来调整子池的 “已用页面” 计数 -(get为下降,put为上升)。通常情况下,如果子池中没有足够的页面,它们会返回与传递的相同的值或 -一个错误。 - -然而,如果预留与子池相关联,可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局 -池调整的数量。例如,假设一个子池包含3个预留的巨页,有人要求5个。与子池相关的3个预留页可以用来 -满足部分请求。但是,必须从全局池中获得2个页面。为了向调用者转达这一信息,将返回值2。然后,调用 -者要负责从全局池中获取另外两个页面。 - - -COW和预留 -========== - -由于共享映射都指向并使用相同的底层页面,COW最大的预留问题是私有映射。在这种情况下,两个任务可 -以指向同一个先前分配的页面。一个任务试图写到该页,所以必须分配一个新的页,以便每个任务都指向它 -自己的页。 - -当该页最初被分配时,该页的预留被消耗了。当由于COW而试图分配一个新的页面时,有可能没有空闲的巨 -页,分配会失败。 - -当最初创建私有映射时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。 -由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常发生并且没有可用的页面 -时,对预留的所有者和非所有者采取不同的行动。 - -在发生异常的任务不是所有者的情况下,异常将失败,该任务通常会收到一个SIGBUS。 - -如果所有者是发生异常的任务,我们希望它能够成功,因为它拥有原始的预留。为了达到这个目的,该页被 -从非所有者任务中解映射出来。这样一来,唯一的引用就是来自拥有者的任务。此外,HPAGE_RESV_UNMAPPED -位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常,它可能 -会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。 - -预留映射的修改 -============== - -以下低级函数用于对预留映射进行修改。通常情况下,这些函数不会被直接调用。而是调用一个预留映射辅 -助函数,该函数调用这些低级函数中的一个。这些低级函数在源代码(mm/hugetlb.c)中得到了相当好的 -记录。这些函数是:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map 
*resv, long f, long t); - -在预留映射上的操作通常涉及两个操作: - -1) region_chg()被调用来检查预留映射,并确定在指定的范围[f, t]内有多少页目前没有被代表。 - - 调用代码执行全局检查和分配,以确定是否有足够的巨页使操作成功。 - -2) - a) 如果操作能够成功,regi_add()将被调用,以实际修改先前传递给regi_chg()的相同范围 - [f, t]的预留映射。 - b) 如果操作不能成功,region_abort被调用,在相同的范围[f, t]内中止操作。 - -注意,这是一个两步的过程, region_add()和 region_abort()在事先调用 region_chg()后保证 -成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作(特别是 region_add())的 -成功。 - -如上所述,region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加 -到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相 -同。然而,在共享映射的情况下,有可能在调用 region_chg() 和 region_add() 之间对预留映射进 -行更改。在这种情况下,regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下,全局计数 -和子池计数很可能是不正确的,需要调整。检查这种情况并进行适当的调整是调用者的责任。 - -函数region_del()被调用以从预留映射中移除区域。 -它通常在以下情况下被调用: - -- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射 - 之前,所有单独的file_region结构体必须被释放。在这种情况下,region_del的范围是[0, LONG_MAX]。 -- 当一个hugetlbfs文件正在被截断时。在这种情况下,所有在新文件大小之后分配的页面必须被释放。 - 此外,预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下,region_del - 的范围是[new_end_of_file, LONG_MAX]。 -- 当在一个hugetlbfs文件中打洞时。在这种情况下,巨页被一次次从文件的中间移除。当这些页被移除 - 时,region_del()被调用以从预留映射中移除相应的条目。在这种情况下,region_del被传递的范 - 围是[page_idx, page_idx + 1]。 - -在任何情况下,region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下,region_del() -会失败。这只能发生在打洞的情况下,即它必须分割一个现有的file_region条目,而不能分配一个新的 -结构体。在这种错误情况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有 -预留。然而,子池和全局预留计数将不反映该预留。为了处理这种情况,调用函数hugetlb_fix_reserve_counts() -来调整计数器,使其与不能被删除的预留映射条目相对应。 - -region_count()在解除私有巨页映射时被调用。在私有映射中,预留映射中没有条目表明存在一个预留。 -因此,通过计算预留映射中的条目数,我们知道有多少预留被消耗了,有多少预留是未完成的 -(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消 -失,子池和全局预留计数被未完成的预留数量所减去。 - -预留映射帮助函数 -================ - -有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣,所以它们只是传入一个 -地址而不是一个范围。此外,它们还传入相关的VMA。从VMA中,可以确定映射的类型(私有或共享)和预留 -映射的位置(inode或VMA)。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而, -它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义,并向调用者隐藏了这个细节:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -该函数为指定的页面调用 region_chg()。如果不存在预留,则返回1。如果存在预留,则返回0:: - - long 
vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用 region_add(),用于指定的页面。与region_chg和region_add的情况一样,该函数应在 -先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加,它将 -返回1,如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出 -现意外的差异,说明在两次调用之间修改了预留映射:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样,该函数应在 -先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数 -中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到 -了私有和共享映射的不同预留映射语义。因此,region_add被调用用于共享映射(因为映射中的条目表 -示预留),而region_del被调用用于私有映射(因为映射中没有条目表示预留)。关于在错误路径上需 -要做什么的更多信息,请参见 “错误路径中的预留清理” 。 - - -错误路径中的预留清理 -==================== - -正如在:ref:`预留映射帮助函数` 一节中提到的,预留的修改分两步进行。首 -先,在分配页面之前调用vma_needs_reservation。如果分配成功,则调用vma_commit_reservation。 -如果不是,则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整, -一切都很好。 - -此外,在一个巨页被实例化后,PagePrivate标志被清空,这样,当页面最终被释放时,计数是 -正确的。 - -然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, -页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 -(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 -显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, -并阻止分配一个预先分配的页面。 - -函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的 -是将预留映射恢复到页面分配前的状态。通过这种方式,预留映射的状态将与页面释放后的全局预留计 -数相对应。 - -函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下, -它将简单地清除该页的PagePrivate标志。这样一来,当页面被释放时,全局预留计数将不会被递增。 -然而,预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址,但它不会像最 -初设想的那样使用一个预留页。 - -有一些代码(最明显的是userfaultfd)不能调用restore_reserve_on_error。在这种情况下, -它简单地修改了PagePrivate,以便在释放巨页时不会泄露预留。 - - -预留和内存策略 -============== -当git第一次被用来管理Linux代码时,每个节点的巨页列表就存在于hstate结构中。预留的概念是 -在一段时间后加入的。当预留被添加时,没有尝试将内存策略考虑在内。虽然cpusets与内存策略不 -完全相同,但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作 -用:: - - - /* - * 当cpuset被配置时,它打破了严格的hugetlb页面预留,因为计数是在一个全局变量上完 - * 
成的。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根据当前cpuset的 - * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时,应用程序仍然有可能 - * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的(或者说太难看了),因 - * 为cpuset太不稳定了,任务或内存节点可以在cpuset之间动态移动。与cpuset共享 - * hugetlb映射的语义变化是不可取的。然而,为了预留一些语义,我们退回到检查当前空闲 - * 页的可用性,作为一种最好的尝试,希望能将cpuset改变语义的影响降到最低。 - */ - -添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败(OOM)。然而,如果一个应用 -程序使用cpusets或内存策略,就不能保证在所需的节点上有巨页可用。即使有足够数量的全局 -预留,也是如此。 - -Hugetlbfs回归测试 -================= - -最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码,请使用 -libhugetlbfs测试套件来检查回归情况。此外,如果你添加了任何新的hugetlb功能,请在 -libhugetlbfs中添加适当的测试。 - --- -Mike Kravetz,2017年4月7日 diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst deleted file mode 100644 index c6e1e7bdb05b..000000000000 --- a/Documentation/translations/zh_CN/vm/hwpoison.rst +++ /dev/null @@ -1,166 +0,0 @@ - -:Original: Documentation/vm/hwpoison.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -hwpoison -======== - -什么是hwpoison? -=============== - - -即将推出的英特尔CPU支持从一些内存错误中恢复( ``MCA恢复`` )。这需要操作系统宣布 -一个页面"poisoned",杀死与之相关的进程,并避免在未来使用它。 - -这个补丁包在虚拟机中实现了必要的(编程)框架。 - -引用概述中的评论:: - - 高级机器的检查与处理。处理方法是损坏的页面被硬件报告,通常是由于2位ECC内 - 存或高速缓存故障。 - - 这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时,当前运行的进程 - 可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理,就可 - 以安全地忽略它. 
而不是用另外一个机器检查去处理它。 - - 处理不同状态的页面缓存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们可以异 - 步访问任何页面。因为内存故障可能随时随地发生,可能违反了他们的一些假设。这就是 - 为什么这段代码必须非常小心。一般来说,它试图使用正常的锁规则,如获得标准锁,即使 - 这意味着错误处理可能需要很长的时间。 - - 这里的一些操作有点低效,并且具有非线性的算法复杂性,因为数据结构没有针对这种情 - 况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的,所 - 以我们希望我们可以摆脱这种情况。 - -该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的 -各种检查组成,用来处理poison的页面。 - -现在主要目标是KVM客户机,但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm -版本。 - -对于KVM的使用,需要一个新的信号类型,这样KVM就可以用适当的地址将机器检查注入到客户 -机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是,所有的应用程序都不要这 -样做,但一些非常专业的应用程序可能会这样做。 - -故障恢复模式 -============ - -有两种(实际上是三种)模式的内存故障恢复可以在。 - -vm.memory_failure_recovery sysctl 置零: - 所有的内存故障都会导致panic。请不要尝试恢复。 - -早期处理 - (可以在全局和每个进程中控制) 一旦检测到错误,立即向应用程序发送SIGBUS这允许 - 应用程序以温和的方式处理内存错误(例如,放弃受影响的对象) 这是KVM qemu使用的 - 模式。 - -推迟处理 - 当应用程序运行到损坏的页面时,发送SIGBUS。这对不知道内存错误的应用程序来说是 - 最好的,默认情况下注意一些页面总是被当作late kill处理。 - -用户控制 -======== - -vm.memory_failure_recovery - 参阅 sysctl.txt - -vm.memory_failure_early_kill - 全局启用early kill - -PR_MCE_KILL - 设置early/late kill mode/revert 到系统默认值。 - - arg1: PR_MCE_KILL_CLEAR: - 恢复到系统默认值 - arg1: PR_MCE_KILL_SET: - arg2定义了线程特定模式 - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - 使用系统全局默认值 - - 注意,如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO),你应该在 - 指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则,SIGBUS将被发送到主线程。 - -PR_MCE_KILL_GET - 返回当前模式 - -测试 -==== - -* madvise(MADV_HWPOISON, ....) 
(as root) - 在测试过程中Poison一个页面 - -* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块 - - corrupt-pfn - 在PFN处注入hwpoison故障,并echoed到这个文件。这做了一些早期过滤,以避 - 免在测试套件中损坏非预期页面。 - unpoison-pfn - 在PFN的Software-unpoison页面对应到这个文件。这样,一个页面可以再次被 - 复用。这只对Linux注入的故障起作用,对真正的内存故障不起作用。 - - 注意这些注入接口并不稳定,可能会在不同的内核版本中发生变化 - - corrupt-filter-dev-major, corrupt-filter-dev-minor - 只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通 - 配符值。这应该只用于人工注入的测试。 - - corrupt-filter-memcg - 限制注入到memgroup拥有的页面。由memcg的inode号指定。 - - Example:: - - mkdir /sys/fs/cgroup/mem/hwpoison - - usemem -m 100 -s 1000 & - echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks - - memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') - echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg - - page-types -p `pidof init` --hwpoison # shall do nothing - page-types -p `pidof usemem` --hwpoison # poison its pages - - corrupt-filter-flags-mask, corrupt-filter-flags-value - 当指定时,只有在((page_flags & mask) == value)的情况下才会poison页面。 - 这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相 - 同。这些标志位在include/linux/kernel-page-flags.h中定义,并在 - Documentation/admin-guide/mm/pagemap.rst中记录。 - -* 架构特定的MCE注入器 - - x86 有 mce-inject, mce-test - - 在mce-test中的一些便携式hwpoison测试程序,见下文。 - -引用 -==== - -http://halobates.de/mce-lc09-2.pdf - 09年LinuxCon的概述演讲 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - 测试套件(在tsrc中的hwpoison特定可移植测试)。 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86特定的注入器 - - -限制 -==== -- 不是所有的页面类型都被支持,而且永远不会。大多数内核内部对象不能被恢 - 复,目前只有LRU页。 - ---- -Andi Kleen, 2009年10月 diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst deleted file mode 100644 index a1c6d529b6ff..000000000000 --- a/Documentation/translations/zh_CN/vm/index.rst +++ /dev/null @@ -1,54 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/index.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -================= -Linux内存管理文档 -================= - -这是一个关于Linux内存管理(mm)子系统内部的文档集,其中有不同层次的细节,包括注释 -和邮件列表的回复,用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建 -议,请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。 -对于控制和调整指南,请参阅(Documentation/admin-guide/mm/index)。 -TODO:待引用文档集被翻译完毕后请及时修改此处) - -.. toctree:: - :maxdepth: 1 - - active_mm - balance - damon/index - free_page_reporting - highmem - ksm - frontswap - hmm - hwpoison - hugetlbfs_reserv - memory-model - mmu_notifier - numa - overcommit-accounting - page_frags - page_owner - page_table_check - remap_file_pages - split_page_table_lock - z3fold - zsmalloc - -TODOLIST: -* arch_pgtable_helpers -* free_page_reporting -* hugetlbfs_reserv -* page_migration -* slub -* transhuge -* unevictable-lru -* vmalloced-kernel-stacks diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/vm/ksm.rst deleted file mode 100644 index 83b0c73984da..000000000000 --- a/Documentation/translations/zh_CN/vm/ksm.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. 
include:: ../disclaimer-zh_CN.rst - -:Original: Documentation/vm/ksm.rst - -:翻译: - - 徐鑫 xu xin - -============ -内核同页合并 -============ - -KSM 是一种节省内存的数据去重功能,由CONFIG_KSM=y启用,并在2.6.32版本时被添加 -到Linux内核。详见 ``mm/ksm.c`` 的实现,以及http://lwn.net/Articles/306704和 -https://lwn.net/Articles/330589 - -KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst -文档中有描述。 - -设计 -==== - -概述 ----- - -概述内容请见mm/ksm.c文档中的“DOC: Overview” - -逆映射 ------- -KSM维护着稳定树中的KSM页的逆映射信息。 - -当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时,则代表了 -KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时,这个KSM页 -的 ``page->mapping`` 指向了该稳定树节点。 - -如果共享数超过了阈值,KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多 -个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息,其中 ``page->mapping`` -指向该"副本"。 - -每个链以及链接到该链中的所有"副本"强制不变的是,它们代表了相同的写保护内存 -内容,尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。 - -这样一来,相比与无限的逆映射链表,稳定树的查找计算复杂性不受影响。但在稳定树 -本身中不能有重复的KSM页面内容仍然是强制要求。 - -由 ``max_page_sharing`` 强制决定的数据去重限制是必要的,以此来避免虚拟内存 -rmap链表变得过大。rmap的遍历具有O(N)的复杂度,其中N是共享页面的rmap_项(即 -虚拟映射)的数量,而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。 -因此,这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进 -程在稳定节点"链"上的遍历也是O(N),但这个N是稳定树"副本"的数量,而不是rmap项 -的数量,因此它对ksmd性能没有显著影响。实际上,最佳稳定树"副本"的候选节点将 -保留在"副本"列表的开头。 - -``max_page_sharing`` 的值设置得高了会促使更快的内存合并(因为将有更少的稳定 -树副本排队进入稳定节点chain->hlist)和更高的数据去重系数,但代价是在交换、压 -缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。 - -``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控 -的影响,高比值可能意味着稳定节点dup中存在碎片,这可以通过在ksmd中引入碎片算 -法来解决,该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup,以便释放 -那些仅包含极少rmap项的稳定节点"dup",但这可能会增加ksmd进程的CPU使用率,并可 -能会减慢应用程序在KSM页面上的只读计算。 - -KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本",以便删减过时了的稳定节点。 -这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。 - -参考 -==== -内核代码请见mm/ksm.c。 -涉及的函数(mm_slot ksm_scan stable_node rmap_item)。 diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst deleted file mode 100644 index 013e30c88d72..000000000000 --- a/Documentation/translations/zh_CN/vm/memory-model.rst +++ /dev/null @@ -1,135 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/memory-model.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -============ -物理内存模型 -============ - -系统中的物理内存可以用不同的方式进行寻址。最简单的情况是,物理内存从地址0开 -始,跨越一个连续的范围,直到最大的地址。然而,这个范围可能包含CPU无法访问的 -小孔隙。那么,在完全不同的地址可能有几个连续的范围。而且,别忘了NUMA,即不 -同的内存库连接到不同的CPU。 - -Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每 -个架构都定义了它所支持的内存模型,默认的内存模型是什么,以及是否有可能手动 -覆盖该默认值。 - -所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页 -帧的状态。 - -无论选择哪种内存模型,物理页框号(PFN)和相应的 `struct page` 之间都存 -在一对一的映射关系。 - -每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn` -帮助函数,允许从PFN到 `struct page` 的转换,反之亦然。 - -FLATMEM -======= - -最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的 -物理内存。 - -在FLATMEM内存模型中,有一个全局的 `mem_map` 数组来映射整个物理内存。对 -于大多数架构,孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page` -对象从未被完全初始化。 - -为了分配 `mem_map` 数组,架构特定的设置代码应该调用free_area_init()函数。 -然而,在调用memblock_free_all()函数之前,映射数组是不能使用的,该函数 -将所有的内存交给页分配器。 - -一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下,特 -定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。 - -使用FLATMEM,PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET` -是 `mem_map` 数组的一个索引。 - -`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。 - -SPARSEMEM -========= - -SPARSEMEM是Linux中最通用的内存模型,它是唯一支持若干高级功能的内存模型, -如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟 -初始化。 - -SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构 -体表示,它包含 `section_mem_map` ,从逻辑上讲,它是一个指向 `struct page` -阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管理。区段的大小 -和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量 -来指定的,这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS` -是一个架构所支持的物理地址的实际宽度,而 `SECTION_SIZE_BITS` 是一个任 -意的值。 - -最大的段数表示为 `NR_MEM_SECTIONS` ,定义为 - -.. 
math:: - - NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} - -`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的 -大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数: - -* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时, `mem_sections` 数组是静态的,有 - `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。 -* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时, `mem_sections` 数组被动态分配。 - 每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象,行数的计算是为了适应所有的 - 内存区。 - -架构设置代码应该调用sparse_init()来初始化内存区和内存映射。 - -通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和 - "sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的 - 值决定。 - -Classic sparse在page->flags中编码了一个页面的段号,并使用PFN的高位来访问映射该页 -框的段。在一个区段内,PFN是指向页数组的索引。 - -Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操 -作。有一个全局的 `struct page *vmemmap` 指针,指向一个虚拟连续的 `struct page` -对象阵列。PFN是该数组的一个索引,`struct page` 从 `vmemmap` 的偏移量是该页的PFN。 - -为了使用vmemmap,一个架构必须保留一个虚拟地址的范围,以映射包含内存映射的物理页,并 -确保 `vmemmap`指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法, -它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求, -它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。 - -虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分 -配的存储中。这种存储用vmem_altmap结构表示,最终通过一长串的函数调用传递给 -vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和 -:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。 - -ZONE_DEVICE -=========== -`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上,为设备驱动识别的物理地址范 -围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下 -事实有关:这些地址范围的页面对象从未被在线标记过,而且必须对设备进行引用,而不仅仅 -是页面,以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` , -为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`, -:c:func:`page_to_pfn`, ,和 :c:func:`get_user_pages` 服务。由于页面引 -用计数永远不会低于1,所以页面永远不会被追踪为空闲内存,页面的 `struct list_head lru` -空间被重新利用,用于向映射该内存的主机设备/驱动程序进行反向引用。 - -虽然 `SPARSEMEM` 将内存作为一个区段的集合,可以选择收集并合成内存块,但 -`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE` -内存从未被在线标记,因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界 -上。这个实现依赖于这种缺乏用户接口的约束,允许子段大小的内存范围被指定给 -:c:func:`arch_add_memory` ,即内存热插拔的上半部分。子段支持允许2MB作为 
-:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。 - -`ZONE_DEVICE` 的用户是: - -* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 - -* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , - 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见/vm/hmm.rst。 - -* p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 - 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst deleted file mode 100644 index b29a37b33628..000000000000 --- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst +++ /dev/null @@ -1,97 +0,0 @@ -:Original: Documentation/vm/mmu_notifier.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -什么时候需要页表锁内通知? -========================== - -当清除一个pte/pmd时,我们可以选择通过在页表锁下(通知版的\*_clear_flush调用 -mmu_notifier_invalidate_range)通知事件。但这种通知并不是在所有情况下都需要的。 - -对于二级TLB(非CPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让 -IOMMU走CPU页表来访问进程的虚拟地址空间)。只有两种情况需要在清除pte/pmd时在持有页 -表锁的同时通知这些二级TLB: - - A) 在mmu_notifier_invalidate_range_end()之前,支持页的地址被释放。 - B) 一个页表项被更新以指向一个新的页面(COW,零页上的写异常,__replace_page(),...)。 - -情况A很明显,你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。 - -情况B更加微妙。为了正确起见,它需要按照以下序列发生: - - - 上页表锁 - - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify()) - - 设置页表项以指向新页 - -如果在设置新的pte/pmd值之前,清除页表项之后没有进行通知,那么你就会破坏设备的C11或 -C++11等内存模型。 - -考虑以下情况(设备使用类似于ATS/PASID的功能)。 - -两个地址addrA和addrB,这样|addrA - addrB| >= PAGE_SIZE,我们假设它们是COW的 -写保护(B的其他情况也适用)。 - -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {尝试写到addrA} - CPU-thread-1 {尝试写到addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {读取addrA并填充设备TLB} - DEV-thread-2 {读取addrB并填充设备TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] 
------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {更新页表以指向addrA的新页}} - CPU-thread-1 {COW_step1: {更新页表以指向addrB的新页}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {写入addrA,这是对新页面的写入} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {写入addrB,这是一个写入新页的过程} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {从旧页中读取addrA} - DEV-thread-2 {从新页面读取addrB} - -所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之前 -就看到了addrB的新值。这就破坏了设备的总内存序。 - -当改变一个pte的写保护或指向一个新的具有相同内容的写保护页(KSM)时,将mmu_notifier_invalidate_range -调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程 -在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占,也是如此。 diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst deleted file mode 100644 index 6af412b924ad..000000000000 --- a/Documentation/translations/zh_CN/vm/numa.rst +++ /dev/null @@ -1,101 +0,0 @@ -:Original: Documentation/vm/numa.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -始于1999年11月,作者: - -========================== -何为非统一内存访问(NUMA)? 
-========================== - -这个问题可以从几个视角来回答:硬件观点和Linux软件视角。 - -从硬件角度看,NUMA系统是一个由多个组件或装配组成的计算机平台,每个组件可能包含0个或更多的CPU、 -本地内存和/或IO总线。为了简洁起见,并将这些物理组件/装配的硬件视角与软件抽象区分开来,我们在 -本文中称这些组件/装配为“单元”。 - -每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能 -不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如,交叉开关或点对点 -链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来,以创建NUMA平台,其中的单元与其 -他单元有多个距离。 - -对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中, -所有的内存都是可见的,并且可以从连接到任何单元的任何CPU中访问,缓存一致性是由处理器缓存和/或 -系统互连在硬件中处理。 - -内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元 -有多远。例如,连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和 -更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的(其他)单元。 - -平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反,这种架构是提供可扩展 -内存带宽的一种手段。然而,为了实现可扩展的内存带宽,系统和应用软件必须安排大部分的内存引用 -[cache misses]到“本地”内存——同一单元的内存,如果有的话——或者到最近的有内存的单元。 - -这就自然而然有了Linux软件对NUMA系统的视角: - -Linux将系统的硬件资源划分为多个软件抽象,称为“节点”。Linux将节点映射到硬件平台的物理单元 -上,对一些架构的细节进行了抽象。与物理单元一样,软件节点可能包含0或更多的CPU、内存和/或IO -总线。同样,对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快 -的访问时间和更高的有效带宽。 - -对于一些架构,如x86,Linux将“隐藏”任何代表没有内存连接的物理单元的节点,并将连接到该单元 -的任何CPU重新分配到代表有内存的单元的节点上。因此,在这些架构上,我们不能假设Linux将所有 -的CPU与一个给定的节点相关联,会看到相同的本地内存访问时间和带宽。 - -此外,对于某些架构,同样以x86为例,Linux支持对额外节点的仿真。对于NUMA仿真,Linux会将现 -有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部 -分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的,当与cpusets一起使用时, -可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -对于每个有内存的节点,Linux构建了一个独立的内存管理子系统,有自己的空闲页列表、使用中页列表、 -使用统计和锁来调解访问。此外,Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE -中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求 -时要访问的区/节点。当一个区没有可用的内存来满足请求时,这种情况被称为“overflow 溢出”或 -“fallback 回退”。 - -由于一些节点包含多个包含不同类型内存的区,Linux必须决定是否对区列表进行排序,使分配回退到不同 -节点上的相同区类型,或同一节点上的不同区类型。这是一个重要的考虑因素,因为有些区,如DMA或DMA32, -代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距 -离排序的远程节点之前,它会尝试回退到同一节点的其他分区。 - -默认情况下,Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说,Linux将试 -图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能 -满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。 - -本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配 
-一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构 -中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥 -远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分不平衡的情况下,任务可 -以在节点之间迁移,远离其初始节点和内核数据结构。 - -系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口,如taskset(1)和numactl(1),以及程 -序接口,如sched_setaffinity(2),来限制任务的迁移,以改善NUMA定位。此外,人们可以使用 -Linux NUMA内存策略修改内核的默认本地分配行为。 [见 -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. - -系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点 -的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst] - -在不隐藏无内存节点的架构上,Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无 -内存的节点,“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反,它 -将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分配将由内核提供 -最近的可用内存来完成。这是同一机制的结果,该机制允许这种分配在一个包含内存的节点溢出时回退到 -其他附近的节点。 - -一些内核分配不希望或不能容忍这种分配回退行为。相反,他们想确保他们从指定的节点获得内存,或者 -得到通知说该节点没有空闲内存。例如,当一个子系统分配每个CPU的内存资源时,通常是这种情况。 - -一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的 -节点ID,然后只从返回的节点ID请求内存。当这样的分配失败时,请求的子系统可以恢复到它自己的回退 -路径。板块内核内存分配器就是这样的一个例子。或者,子系统可以选择在分配失败时禁用或不启用自己。 -内核分析子系统就是这样的一个例子。 - -如果架构支持——不隐藏无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些 -子系统如果试图完全从无内存的节点分配内存,将无法初始化。为了透明地支持这种架构,内核子系统可 -以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样,这是同 -一个节点,默认的本地页分配将从这个节点开始尝试。 diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst deleted file mode 100644 index 8765cb118f24..000000000000 --- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst +++ /dev/null @@ -1,86 +0,0 @@ -:Original: Documentation/vm/overcommit-accounting.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - - -============== -超量使用审计 -============== - -Linux内核支持下列超量使用处理模式 - -0 - 启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。 - 它确保严重的疯狂分配失败,同时允许超量使用以减少swap的使用。在这种模式下, - 允许root分配稍多的内存。这是默认的。 -1 - 总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码,只是依赖 - 几乎完全由零页组成的虚拟内存 - -2 - 不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量 - (默认为50%)。根据你使用的数量,在大多数情况下,这意味着一个进程在访问页面时 - 不会被杀死,但会在内存分配上收到相应的错误。 - - 对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说 - 是很有用的。 - 
-超量使用策略是通过sysctl `vm.overcommit_memory` 设置的。 - -可以通过 `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (绝对值) -来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。 - -在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前 -的超量使用和提交量。 - -陷阱 -==== - -C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证,并在接近边缘的地方运行, -你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说,这并 -不重要,但如果你真的非常关心的话,这就是一个值得关注的案例。 - - -在模式2中,MAP_NORESERVE标志被忽略。 - - -它是如何工作的 -============== - -超量使用是基于以下规则 - -对于文件映射 - | SHARED or READ-only - 0 cost (该文件是映射而不是交换) - | PRIVATE WRITABLE - 每个实例的映射大小 - -对于匿名或者 ``/dev/zero`` 映射 - | SHARED - 映射的大小 - | PRIVATE READ-only - 0 cost (但作用不大) - | PRIVATE WRITABLE - 每个实例的映射大小 - -额外的计数 - | 通过mmap制作可写副本的页面 - | 从同一池中提取的shmfs内存 - -状态 -==== - -* 我们核算mmap内存映射 -* 我们核算mprotect在提交中的变化 -* 我们核算mremap的大小变化 -* 我们的审计 brk -* 审计munmap -* 我们在/proc中报告commit 状态 -* 核对并检查分叉的情况 -* 审查堆栈处理/执行中的构建 -* 叙述SHMfs的情况 -* 实现实际限制的执行 - -待续 -==== -* ptrace 页计数(这很难)。 diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst deleted file mode 100644 index ad27fed33634..000000000000 --- a/Documentation/translations/zh_CN/vm/page_frags.rst +++ /dev/null @@ -1,38 +0,0 @@ -:Original: Documentation/vm/page_frag.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -页面片段 -======== - -一个页面片段是一个任意长度的任意偏移的内存区域,它位于一个0或更高阶的复合页面中。 -该页中的多个碎片在该页的引用计数器中被单独计算。 - -page_frag函数,page_frag_alloc和page_frag_free,为页面片段提供了一个简单 -的分配框架。这被网络堆栈和网络设备驱动使用,以提供一个内存的支持区域,作为 -sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 - -为了使用页面片段API,需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点, -并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用, -这在分配时开销可能会很大。然而,由于这种缓存的性质,要求任何对缓存的调用都要受到每 -个CPU的限制,或者每个CPU的限制,并在执行碎片分配时强制禁止中断。 - -网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 -netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache -被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 -它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 -将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 - -许多网络设备驱动程序使用类似的方法来分配页面片段,但页面片段是在环或描述符级别上 
-缓存的。为了实现这些情况,有必要提供一种拆解页面缓存的通用方法。出于这个原因, -__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。 -这样做的好处是,它允许清理被添加到一个页面的多个引用,以避免每次分配都调用 -get_page。 - -Alexander Duyck,2016年11月29日。 diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst deleted file mode 100644 index 9e951fabba9d..000000000000 --- a/Documentation/translations/zh_CN/vm/page_owner.rst +++ /dev/null @@ -1,116 +0,0 @@ -:Original: Documentation/vm/page_owner.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -================================ -page owner: 跟踪谁分配的每个页面 -================================ - -概述 -==== - -page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。 -当分配发生时,有关分配的信息,如调用堆栈和页面的顺序被存储到每个页面的特定存储中。 -当我们需要了解所有页面的状态时,我们可以获得并分析这些信息。 - -尽管我们已经有了追踪页面分配/释放的tracepoint,但用它来分析谁分配的每个页面是 -相当复杂的。我们需要扩大跟踪缓冲区,以防止在用户空间程序启动前出现重叠。而且,启 -动的程序会不断地将跟踪缓冲区转出,供以后分析,这将会改变系统的行为,会产生更多的 -可能性,而不是仅仅保留在内存中,所以不利于调试。 - -页面所有者也可以用于各种目的。例如,可以通过每个页面的gfp标志信息获得精确的碎片 -统计。如果启用了page owner,它就已经实现并激活了。我们非常欢迎其他用途。 - -page owner在默认情况下是禁用的。所以,如果你想使用它,你需要在你的启动cmdline -中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有启用启动 -选项而在运行时禁用page owner,那么运行时的开销是很小的。如果在运行时禁用,它不 -需要内存来存储所有者信息,所以没有运行时内存开销。而且,页面所有者在页面分配器的 -热路径中只插入了两个不可能的分支,如果不启用,那么分配就会像没有页面所有者的内核 -一样进行。这两个不可能的分支应该不会影响到分配的性能,特别是在静态键跳转标签修补 -功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。 - -- 没有page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- 有page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -虽然总共增加了8KB的代码,但page_alloc.o增加了520字节,其中不到一半是在hotpath -中。构建带有page owner的内核,并在需要时打开它,将是调试内核内存问题的最佳选择。 - -有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这 -个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些,所以,在初始化 -之前,许多页面可以被分配,但它们没有所有者信息。为了解决这个问题,这些早期分配的 -页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息,但至 -少,我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上,有13343 -个早期分配的页面被捕捉和标记,尽管它们大部分是由结构页扩展功能分配的。总之,在这 -之后,没有任何页面处于未追踪状态。 - -使用方法 -======== - -1) 构建用户空间的帮助:: - - cd 
tools/vm - make page_owner_sort - -2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. - -3) 做你想调试的工作。 - -4) 分析来自页面所有者的信息:: - - cat /sys/kernel/debug/page_owner > page_owner_full.txt - ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - - ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值):: - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexp提 - 取页序值,计算buf的次数和页数,最后根据参数进行排序。 - - 在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出:: - - XXX times, XXX pages: - Page allocated via order XXX, ... - // Detailed stack - - 默认情况下, ``page_owner_sort`` 是根据buf的时间来排序的。如果你想 - 按buf的页数排序,请使用-m参数。详细的参数是: - - 基本函数: - - Sort: - -a 按内存分配时间排序 - -m 按总内存排序 - -p 按pid排序。 - -P 按tgid排序。 - -r 按内存释放时间排序。 - -s 按堆栈跟踪排序。 - -t 按时间排序(默认)。 - - 其它函数: - - Cull: - -c 通过比较堆栈跟踪而不是总块来进行剔除。 - - Filter: - -f 过滤掉内存已被释放的块的信息。 diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst deleted file mode 100644 index a29fc1b360e6..000000000000 --- a/Documentation/translations/zh_CN/vm/page_table_check.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -:Original: Documentation/vm/page_table_check.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -======== -页表检查 -======== - -概述 -==== - -页表检查允许通过确保防止某些类型的内存损坏来强化内核。 - -当新的页面可以从用户空间访问时,页表检查通过将它们的页表项(PTEs PMD等)添加到页表中来执行额外 -的验证。 - -在检测到损坏的情况下,内核会被崩溃。页表检查有一个小的性能和内存开销。因此,它在默认情况下是禁用 -的,但是在额外的加固超过性能成本的系统上,可以选择启用。另外,由于页表检查是同步的,它可以帮助调 -试双映射内存损坏问题,在错误的映射发生时崩溃内核,而不是在内存损坏错误发生后内核崩溃。 - -双重映射检测逻辑 -================ - -+-------------------+-------------------+-------------------+------------------+ -| Current Mapping | New mapping | Permissions | Rule | -+===================+===================+===================+==================+ -| Anonymous | Anonymous | Read | Allow | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Anonymous | Read / Write | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Named | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Anonymous | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Named | Any | Allow | -+-------------------+-------------------+-------------------+------------------+ - -启用页表检查 -============ - -用以下方法构建内核: - -- PAGE_TABLE_CHECK=y - 注意,它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。 - -- 使用 "page_table_check=on" 内核参数启动。 - -可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核,以便在没有额外的内核参数的情况下获得页表 -支持。 diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst deleted file mode 100644 index af6b7e28af23..000000000000 --- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst +++ /dev/null @@ -1,32 +0,0 @@ -:Original: Documentation/vm/remap_file_pages.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -============================== -remap_file_pages()系统调用 -============================== - -remap_file_pages()系统调用被用来创建一个非线性映射,也就是说,在这个映射中, 
-文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好 -处是,前者不需要内核创建额外的VMA(虚拟内存区)数据结构。 - -支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码,包括热 -路径。另外,为了使非线性映射工作,内核需要一种方法来区分正常的页表项和带有文件 -偏移的项(pte_file)。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资 -源,特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。 - -幸运的是,在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS -实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。 -由于64位系统的广泛使用,这种使用情况已经不重要了。 - -syscall被废弃了,现在用一个模拟来代替它。仿真会创建新的VMA,而不是非线性映射。 -对于remap_file_pages()的少数用户来说,它的工作速度会变慢,但ABI被保留了。 - -仿真的一个副作用(除了性能之外)是,由于额外的VMA,用户可以更容易达到 -vm.max_map_count的限制。关于限制的更多细节,请参见DEFAULT_MAX_MAP_COUNT -的注释。 diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst deleted file mode 100644 index 50694d97c426..000000000000 --- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst +++ /dev/null @@ -1,96 +0,0 @@ -:Original: Documentation/vm/split_page_table_lock.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -================================= -分页表锁(split page table lock) -================================= - -最初,mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方 -法导致了多线程应用程序的缺页异常可扩展性差,因为对锁的争夺很激烈。为了提高可扩 -展性,我们引入了分页表锁。 - -有了分页表锁,我们就有了单独的每张表锁来顺序化对表的访问。目前,我们对PTE和 -PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。 - -有一些辅助工具来锁定/解锁一个表和其他访问器函数: - - - pte_offset_map_lock() - 映射pte并获取PTE表锁,返回所取锁的指针; - - pte_unmap_unlock() - 解锁和解映射PTE表; - - pte_alloc_map_lock() - 如果需要的话,分配PTE表并获取锁,如果分配失败,返回已获取的锁的指针 - 或NULL; - - pte_lockptr() - 返回指向PTE表锁的指针; - - pmd_lock() - 取得PMD表锁,返回所取锁的指针。 - - pmd_lockptr() - 返回指向PMD表锁的指针; - -如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)小于或等于NR_CPUS,则在编译 -时启用PTE表的分页表锁。如果分页锁被禁用,所有的表都由mm->page_table_lock -来保护。 - -如果PMD表启用了分页锁,并且架构支持它,那么PMD表的分页锁就会被启用(见 -下文)。 - -Hugetlb 和分页表锁 -================== - -Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁,但不对PUD使用。 - -Hugetlb特定的辅助函数: - - - huge_pte_lock() - 对PMD_SIZE页面采取pmd分割锁,否则mm->page_table_lock; - - huge_pte_lockptr() - 返回指向表锁的指针。 - -架构对分页表锁的支持 -==================== - 
-没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() -和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 - -确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 -面。这个区域与page->ptl共享存储。 - -PMD分页锁只有在你有两个以上的页表级别时才有意义。 - -启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 -用pgtable_pmd_page_dtor()。 - -分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() -中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 -分配一些PMD。 - -一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 - -注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 -须正确处理。 - -page->ptl -========= - -page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struct page。它 -与page->private(以及union中的其他几个字段)共享存储。 - -为了避免增加struct page的大小并获得最佳性能,我们使用了一个技巧: - - - 如果spinlock_t适合于long,我们使用page->ptl作为spinlock,这样我们 - 就可以避免间接访问并节省一个缓存行。 - - 如果spinlock_t的大小大于long的大小,我们使用page->ptl作为spinlock_t - 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 - 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 - -PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t -分配在pgtable_pmd_page_ctor()中。 - -请不要直接访问page->ptl - -使用适当的辅助函数。 diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst deleted file mode 100644 index 57204aa08caa..000000000000 --- a/Documentation/translations/zh_CN/vm/z3fold.rst +++ /dev/null @@ -1,31 +0,0 @@ -:Original: Documentation/vm/z3fold.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - - -====== -z3fold -====== - -z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。 -它是zbud的衍生物,允许更高的压缩率,保持其前辈的简单性和确定性。 - -z3fold和zbud的主要区别是: - -* 与zbud不同的是,z3fold允许最大的PAGE_SIZE分配。 -* z3fold在其页面中最多可以容纳3个压缩页面 -* z3fold本身没有输出任何API,因此打算通过zpool的API来使用 - -为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储一个整数的压缩页,但是 -它最多可以存储3页,不像zbud最多可以存储2页。因此压缩率达到2.7倍左右,而zbud的压缩 -率是1.7倍左右。 - -不像zbud(但也像zsmalloc),z3fold_alloc()那样不返回一个可重复引用的指针。相反,它 -返回一个无符号长句柄,它编码了被分配对象的实际位置。 - -保持有效的压缩率接近于zsmalloc,z3fold不依赖于MMU的启用,并提供更可预测的回收行 -为,这使得它更适合于小型和反应迅速的系统。 diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst deleted file mode 100644 index 
29e9c70a8eb6..000000000000 --- a/Documentation/translations/zh_CN/vm/zsmalloc.rst +++ /dev/null @@ -1,78 +0,0 @@ -:Original: Documentation/vm/zs_malloc.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -======== -zsmalloc -======== - -这个分配器是为与zram一起使用而设计的。因此,该分配器应该在低内存条件下工作良好。特别是, -它从未尝试过higher order页面的分配,这在内存压力下很可能会失败。另一方面,如果我们只 -是使用单(0-order)页,它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将 -占据整个页面。这是其前身(xvmalloc)的主要问题之一。 - -为了克服这些问题,zsmalloc分配了一堆0-order页面,并使用各种"struct page"字段将它 -们链接起来。这些链接的页面作为一个单一的higher order页面,即一个对象可以跨越0-order -页面的边界。代码将这些链接的页面作为一个实体,称为zspage。 - -为了简单起见,zsmalloc只能分配大小不超过PAGE_SIZE的对象,因为这满足了所有当前用户的 -要求(在最坏的情况下,页面是不可压缩的,因此以"原样"即未压缩的形式存储)。对于大于这 -个大小的分配请求,会返回失败(见zs_malloc)。 - -此外,zs_malloc()并不返回一个可重复引用的指针。相反,它返回一个不透明的句柄(无符号 -长),它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久 -映射,因为这在32位系统上会导致问题,因为内核空间映射的VA区域非常小。因此,在使用分配 -的内存之前,对象必须使用zs_map_object()进行映射以获得一个可用的指针,随后使用 -zs_unmap_object()解除映射。 - -stat -==== - -通过CONFIG_ZSMALLOC_STAT,我们可以通过 ``/sys/kernel/debug/zsmalloc/`` -看到zsmalloc内部信息。下面是一个统计输出的例子。:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... 
- - -class - 索引 -size - zspage存储对象大小 -almost_empty - ZS_ALMOST_EMPTY zspage的数量(见下文)。 -almost_full - ZS_ALMOST_FULL zspage的数量(见下图) -obj_allocated - 已分配对象的数量 -obj_used - 分配给用户的对象的数量 -pages_used - 为该类分配的页数 -pages_per_zspage - 组成一个zspage的0-order页面的数量 - -当n <= N / f时,我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组,其中 - -* n = 已分配对象的数量 -* N = zspage可以存储的对象总数 -* f = fullness_threshold_frac(即,目前是4个) - -同样地,我们将zspage分配给: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst index e1ce9d8c06f8..e97d7d578751 100644 --- a/Documentation/translations/zh_TW/index.rst +++ b/Documentation/translations/zh_TW/index.rst @@ -128,7 +128,7 @@ TODOList: * security/index * sound/index * crypto/index -* vm/index +* mm/index * bpf/index * usb/index * PCI/index diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore deleted file mode 100644 index bc74f5643008..000000000000 --- a/Documentation/vm/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -page-types -slabinfo diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst deleted file mode 100644 index 6f8269c284ed..000000000000 --- a/Documentation/vm/active_mm.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _active_mm: - -========= -Active MM -========= - -:: - - List: linux-kernel - Subject: Re: active_mm - From: Linus Torvalds - Date: 1999-07-30 21:36:24 - - Cc'd to linux-kernel, because I don't write explanations all that often, - and when I do I feel better about more people reading them. - - On Fri, 30 Jul 1999, David Mosberger wrote: - > - > Is there a brief description someplace on how "mm" vs. "active_mm" in - > the task_struct are supposed to be used? (My apologies if this was - > discussed on the mailing lists---I just returned from vacation and - > wasn't able to follow linux-kernel for a while). 
- - Basically, the new setup is: - - - we have "real address spaces" and "anonymous address spaces". The - difference is that an anonymous address space doesn't care about the - user-level page tables at all, so when we do a context switch into an - anonymous address space we just leave the previous address space - active. - - The obvious use for a "anonymous address space" is any thread that - doesn't need any user mappings - all kernel threads basically fall into - this category, but even "real" threads can temporarily say that for - some amount of time they are not going to be interested in user space, - and that the scheduler might as well try to avoid wasting time on - switching the VM state around. Currently only the old-style bdflush - sync does that. - - - "tsk->mm" points to the "real address space". For an anonymous process, - tsk->mm will be NULL, for the logical reason that an anonymous process - really doesn't _have_ a real address space at all. - - - however, we obviously need to keep track of which address space we - "stole" for such an anonymous user. For that, we have "tsk->active_mm", - which shows what the currently active address space is. - - The rule is that for a process with a real address space (ie tsk->mm is - non-NULL) the active_mm obviously always has to be the same as the real - one. - - For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the - "borrowed" mm while the anonymous process is running. When the - anonymous process gets scheduled away, the borrowed address space is - returned and cleared. - - To support all that, the "struct mm_struct" now has two counters: a - "mm_users" counter that is how many "real address space users" there are, - and a "mm_count" counter that is the number of "lazy" users (ie anonymous - users) plus one if there are any real users. 
- - Usually there is at least one real user, but it could be that the real - user exited on another CPU while a lazy user was still active, so you do - actually get cases where you have a address space that is _only_ used by - lazy users. That is often a short-lived state, because once that thread - gets scheduled away in favour of a real thread, the "zombie" mm gets - released because "mm_count" becomes zero. - - Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any - more. "init_mm" should be considered just a "lazy context when no other - context is available", and in fact it is mainly used just at bootup when - no real VM has yet been created. So code that used to check - - if (current->mm == &init_mm) - - should generally just do - - if (!current->mm) - - instead (which makes more sense anyway - the test is basically one of "do - we have a user context", and is generally done by the page fault handler - and things like that). - - Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, - because it slightly changes the interfaces to accommodate the alpha (who - would have thought it, but the alpha actually ends up having one of the - ugliest context switch codes - unlike the other architectures where the MM - and register state is separate, the alpha PALcode joins the two, and you - need to switch both together). - - (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst deleted file mode 100644 index cbaee9e59241..000000000000 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ /dev/null @@ -1,260 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. 
_arch_page_table_helpers: - -=============================== -Architecture Page Table Helpers -=============================== - -Generic MM expects architectures (with MMU) to provide helpers to create, access -and modify page table entries at various level for different memory functions. -These page table helpers need to conform to a common semantics across platforms. -Following tables describe the expected semantics which can also be tested during -boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug -test need to be in sync. - - -PTE Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pte_same | Tests whether both PTE entries are the same | -+---------------------------+--------------------------------------------------+ -| pte_bad | Tests a non-table mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_present | Tests a valid mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_young | Tests a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_dirty | Tests a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_write | Tests a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_special | Tests a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_protnone | Tests a PROT_NONE PTE | -+---------------------------+--------------------------------------------------+ -| pte_devmap | Tests a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_soft_dirty | Tests a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_soft_dirty | Tests a soft dirty 
swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkyoung | Creates a young PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkold | Creates an old PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdirty | Creates a dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkclean | Creates a clean PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkwrite | Creates a writable PTE | -+---------------------------+--------------------------------------------------+ -| pte_wrprotect | Creates a write protected PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkspecial | Creates a special PTE | -+---------------------------+--------------------------------------------------+ -| pte_mkdevmap | Creates a ZONE_DEVICE mapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mksoft_dirty | Creates a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_clear_soft_dirty | Clears a soft dirty PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_mksoft_dirty | Creates a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_swp_clear_soft_dirty | Clears a soft dirty swapped PTE | -+---------------------------+--------------------------------------------------+ -| pte_mknotpresent | Invalidates a mapped PTE | -+---------------------------+--------------------------------------------------+ -| ptep_clear | Clears a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears and returns PTE | 
-+---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | -+---------------------------+--------------------------------------------------+ -| ptep_test_and_clear_young | Clears young from a PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_wrprotect | Converts into a write protected PTE | -+---------------------------+--------------------------------------------------+ -| ptep_set_access_flags | Converts into a more permissive PTE | -+---------------------------+--------------------------------------------------+ - - -PMD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pmd_same | Tests whether both PMD entries are the same | -+---------------------------+--------------------------------------------------+ -| pmd_bad | Tests a non-table mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_leaf | Tests a leaf mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_huge | Tests a HugeTLB mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | -+---------------------------+--------------------------------------------------+ -| pmd_present | Tests a valid mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_young | Tests a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_dirty | Tests a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_write | Tests a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_special | Tests a special PMD | 
-+---------------------------+--------------------------------------------------+ -| pmd_protnone | Tests a PROT_NONE PMD | -+---------------------------+--------------------------------------------------+ -| pmd_devmap | Tests a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_soft_dirty | Tests a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_soft_dirty | Tests a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkyoung | Creates a young PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkold | Creates an old PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdirty | Creates a dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkclean | Creates a clean PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkwrite | Creates a writable PMD | -+---------------------------+--------------------------------------------------+ -| pmd_wrprotect | Creates a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkspecial | Creates a special PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkdevmap | Creates a ZONE_DEVICE mapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mksoft_dirty | Creates a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_clear_soft_dirty | Clears a soft dirty PMD | -+---------------------------+--------------------------------------------------+ -| pmd_swp_mksoft_dirty | Creates a soft dirty swapped PMD | 
-+---------------------------+--------------------------------------------------+ -| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD | -+---------------------------+--------------------------------------------------+ -| pmd_mkinvalid | Invalidates a mapped PMD [1] | -+---------------------------+--------------------------------------------------+ -| pmd_set_huge | Creates a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmd_clear_huge | Clears a PMD huge mapping | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_get_and_clear_full | Clears a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_test_and_clear_young | Clears young from a PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_wrprotect | Converts into a write protected PMD | -+---------------------------+--------------------------------------------------+ -| pmdp_set_access_flags | Converts into a more permissive PMD | -+---------------------------+--------------------------------------------------+ - - -PUD Page Table Helpers -====================== - -+---------------------------+--------------------------------------------------+ -| pud_same | Tests whether both PUD entries are the same | -+---------------------------+--------------------------------------------------+ -| pud_bad | Tests a non-table mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_leaf | Tests a leaf mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_huge | Tests a HugeTLB mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_trans_huge | Tests a Transparent Huge Page 
(THP) at PUD | -+---------------------------+--------------------------------------------------+ -| pud_present | Tests a valid mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_young | Tests a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_dirty | Tests a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_write | Tests a writable PUD | -+---------------------------+--------------------------------------------------+ -| pud_devmap | Tests a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkyoung | Creates a young PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkold | Creates an old PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdirty | Creates a dirty PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkclean | Creates a clean PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkwrite | Creates a writable PUD | -+---------------------------+--------------------------------------------------+ -| pud_wrprotect | Creates a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | -+---------------------------+--------------------------------------------------+ -| pud_mkinvalid | Invalidates a mapped PUD [1] | -+---------------------------+--------------------------------------------------+ -| pud_set_huge | Creates a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pud_clear_huge | Clears a PUD huge mapping | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear | 
Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_get_and_clear_full | Clears a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_test_and_clear_young | Clears young from a PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_wrprotect | Converts into a write protected PUD | -+---------------------------+--------------------------------------------------+ -| pudp_set_access_flags | Converts into a more permissive PUD | -+---------------------------+--------------------------------------------------+ - - -HugeTLB Page Table Helpers -========================== - -+---------------------------+--------------------------------------------------+ -| pte_huge | Tests a HugeTLB | -+---------------------------+--------------------------------------------------+ -| pte_mkhuge | Creates a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_dirty | Tests a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_write | Tests a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkdirty | Creates a dirty HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_mkwrite | Creates a writable HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_pte_wrprotect | Creates a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_get_and_clear | Clears a HugeTLB | -+---------------------------+--------------------------------------------------+ -| huge_ptep_set_wrprotect | Converts into a write protected HugeTLB | -+---------------------------+--------------------------------------------------+ -| 
huge_ptep_set_access_flags | Converts into a more permissive HugeTLB | -+---------------------------+--------------------------------------------------+ - - -SWAP Page Table Helpers -======================== - -+---------------------------+--------------------------------------------------+ -| __pte_to_swp_entry | Creates a swapped entry (arch) from a mapped PTE | -+---------------------------+--------------------------------------------------+ -| __swp_to_pte_entry | Creates a mapped PTE from a swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| __pmd_to_swp_entry | Creates a swapped entry (arch) from a mapped PMD | -+---------------------------+--------------------------------------------------+ -| __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | -+---------------------------+--------------------------------------------------+ -| is_migration_entry | Tests a migration (read or write) swapped entry | -+-------------------------------+----------------------------------------------+ -| is_writable_migration_entry | Tests a write migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_readable_migration_entry | Creates a read migration swapped entry | -+-------------------------------+----------------------------------------------+ -| make_writable_migration_entry | Creates a write migration swapped entry | -+-------------------------------+----------------------------------------------+ - -[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst deleted file mode 100644 index 6a1fadf3e173..000000000000 --- a/Documentation/vm/balance.rst +++ /dev/null @@ -1,102 +0,0 @@ -.. 
_balance: - -================ -Memory Balancing -================ - -Started Jan 2000 by Kanoj Sarcar - -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as -well as for non __GFP_IO allocations. - -The first reason why a caller may avoid reclaim is that the caller can not -sleep due to holding a spinlock or is in interrupt context. The second may -be that the caller is willing to fail the allocation without incurring the -overhead of page reclaim. This may happen for opportunistic high-order -allocation requests that have order-0 fallback options. In such cases, -the caller may also wish to avoid waking kswapd. - -__GFP_IO allocation requests are made to prevent file system deadlocks. - -In the absence of non sleepable allocation requests, it seems detrimental -to be doing balancing. Page reclamation can be kicked off lazily, that -is, only when needed (aka zone free memory is 0), instead of making it -a proactive process. - -That being said, the kernel should try to fulfill requests for direct -mapped pages from the direct mapped pool, instead of falling back on -the dma pool, so as to keep the dma pool filled for dma requests (atomic -or not). A similar argument applies to highmem and direct mapped pages. -OTOH, if there is a lot of free dma pages, it is preferable to satisfy -regular memory requests by allocating one from the dma pool, instead -of incurring the overhead of regular zone balancing. - -In 2.2, memory balancing/page reclamation would kick off only when the -_total_ number of free pages fell below 1/64 th of total memory. With the -right ratio of dma and regular memory, it is quite possible that balancing -would not be done even when the dma zone was completely empty. 2.2 has -been running production machines of varying memory sizes, and seems to be -doing fine even with the presence of this problem. In 2.3, due to -HIGHMEM, this problem is aggravated. 
- -In 2.3, zone balancing can be done in one of two ways: depending on the -zone size (and possibly of the size of lower class zones), we can decide -at init time how many free pages we should aim for while balancing any -zone. The good part is, while balancing, we do not need to look at sizes -of lower class zones, the bad part is, we might do too frequent balancing -due to ignoring possibly lower usage in the lower class zones. Also, -with a slight change in the allocation routine, it is possible to reduce -the memclass() macro to be a simple equality. - -Another possible solution is that we balance only when the free memory -of a zone _and_ all its lower class zones falls below 1/64th of the -total memory in the zone and its lower class zones. This fixes the 2.2 -balancing problem, and stays as close to 2.2 behavior as possible. Also, -the balancing algorithm works the same way on the various architectures, -which have different numbers and types of zones. If we wanted to get -fancy, we could assign different weights to free pages in different -zones in the future. - -Note that if the size of the regular zone is huge compared to dma zone, -it becomes less significant to consider the free dma pages while -deciding whether to balance the regular zone. The first solution -becomes more attractive then. - -The appended patch implements the second solution. It also "fixes" two -problems: first, kswapd is woken up as in 2.2 on low memory conditions -for non-sleepable allocations. Second, the HIGHMEM zone is also balanced, -so as to give a fighting chance for replace_with_highmem() to get a -HIGHMEM page, as well as to ensure that HIGHMEM allocations do not -fall back into regular zone. This also makes sure that HIGHMEM pages -are not leaked (for example, in situations where a HIGHMEM page is in -the swapcache but is not being used by anyone) - -kswapd also needs to know about the zones it should balance. 
kswapd is -primarily needed in a situation where balancing can not be done, -probably because all allocation requests are coming from intr context -and all process contexts are sleeping. For 2.3, kswapd does not really -need to balance the highmem zone, since intr context does not request -highmem pages. kswapd looks at the zone_wake_kswapd field in the zone -structure to decide whether a zone needs balancing. - -Page stealing from process memory and shm is done if stealing the page would -alleviate memory pressure on any zone in the page's node that has fallen below -its watermark. - -watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These -are per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below watermark[WMARK_MIN], the hysteretic field -low_on_memory gets set. This stays set till the number of free pages becomes -watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will -try to free some pages in the zone (providing GFP_WAIT is set in the request). -Orthogonal to this, is the decision to poke kswapd to free some zone pages. -That decision is not hysteresis based, and is done when the number of free -pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. - - -(Good) Ideas that I have heard: - -1. Dynamic experience should influence balancing: number of failed requests - for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net) -2. Implement a replace_with_highmem()-like replace_with_regular() to preserve - dma pages. (lkd@tantalophile.demon.co.uk) diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst deleted file mode 100644 index eb2b31eedfa1..000000000000 --- a/Documentation/vm/bootmem.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -=========== -Boot Memory -=========== diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst deleted file mode 100644 index 08f34df45523..000000000000 --- a/Documentation/vm/damon/api.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============= -API Reference -============= - -Kernel space programs can use every feature of DAMON using the APIs below. All you -need to do is to include ``damon.h``, which is located in ``include/linux/`` of -the source tree. - -Structures -========== - -.. kernel-doc:: include/linux/damon.h - - -Functions -========= - -.. kernel-doc:: mm/damon/core.c diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst deleted file mode 100644 index 0cff6fac6b7e..000000000000 --- a/Documentation/vm/damon/design.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====== -Design -====== - -Configurable Layers -=================== - -DAMON provides data access monitoring functionality while making the accuracy -and the overhead controllable. The fundamental access monitorings require -primitives that are dependent on and optimized for the target address space. On -the other hand, the accuracy and overhead tradeoff mechanism, which is the core -of DAMON, is in the pure logic space. DAMON separates the two parts in -different layers and defines its interface to allow various low level -primitives implementations configurable with the core logic. We call the low -level primitives implementations monitoring operations. - -Due to this separated design and the configurable interface, users can extend -DAMON for any address space by configuring the core logics with appropriate -monitoring operations. If an appropriate one is not provided, users can implement -the operations on their own. 
- -For example, physical memory, virtual memory, swap space, those for specific -processes, NUMA nodes, files, and backing memory devices would be supportable. -Also, if some architectures or devices support special optimized access check -primitives, those will be easily configurable. - - -Reference Implementations of Address Space Specific Monitoring Operations -========================================================================= - -The monitoring operations are defined in two parts: - -1. Identification of the monitoring target address range for the address space. -2. Access check of specific address range in the target space. - -DAMON currently provides the implementations of the operations for the physical -and virtual address spaces. Below two subsections describe how those work. - - -VMA-based Target Address Range Construction -------------------------------------------- - -This is only for the virtual address space monitoring operations -implementation. That for the physical address space simply asks users to -manually set the monitoring target address ranges. - -Only small parts in the super-huge virtual address space of the processes are -mapped to the physical memory and accessed. Thus, tracking the unmapped -address regions is just wasteful. However, because DAMON can deal with some -level of noise using the adaptive regions adjustment mechanism, tracking every -mapping is not strictly required but could even incur a high overhead in some -cases. That said, too huge unmapped areas inside the monitoring target should -be removed to not take the time for the adaptive mechanism. - -For the reason, this implementation converts the complex mappings to three -distinct regions that cover every mapped area of the address space. The two -gaps between the three regions are the two biggest unmapped areas in the given -address space. 
The two biggest unmapped areas would be the gap between the -heap and the uppermost mmap()-ed region, and the gap between the lowermost -mmap()-ed region and the stack in most of the cases. Because these gaps are -exceptionally huge in usual address spaces, excluding these will be sufficient -to make a reasonable trade-off. Below shows this in detail:: - - <heap> - <BIG UNMAPPED REGION 1> - <uppermost mmap()-ed region> - (small mmap()-ed regions and munmap()-ed regions) - <lowermost mmap()-ed region> - <BIG UNMAPPED REGION 2> - <stack> - - -PTE Accessed-bit Based Access Check ------------------------------------ - -Both of the implementations for physical and virtual address spaces use PTE -Accessed-bit for basic access checks. The only difference is the way of -finding the relevant PTE Accessed bit(s) from the address. While the -implementation for the virtual address walks the page table for the target task -of the address, the implementation for the physical address walks every page -table having a mapping to the address. In this way, the implementations find -and clear the bit(s) for the next sampling target address and check whether the -bit(s) are set again after one sampling period. This could disturb other kernel -subsystems using the Accessed bits, namely Idle page tracking and the reclaim -logic. DAMON does nothing to avoid disturbing Idle page tracking, so handling -the interference is the responsibility of sysadmins. However, it solves the -conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, -as Idle page tracking does. - - -Address Space Independent Core Mechanisms -========================================= - -Below four sections describe each of the DAMON core mechanisms and the five -monitoring attributes, ``sampling interval``, ``aggregation interval``, -``update interval``, ``minimum number of regions``, and ``maximum number of -regions``. - - -Access Frequency Monitoring ---------------------------- - -The output of DAMON says what pages are how frequently accessed for a given -duration.
The resolution of the access frequency is controlled by setting -``sampling interval`` and ``aggregation interval``. In detail, DAMON checks -access to each page per ``sampling interval`` and aggregates the results. In -other words, it counts the number of the accesses to each page. After each -``aggregation interval`` passes, DAMON calls callback functions that were previously -registered by users so that users can read the aggregated results and then -clears the results. This can be described in below simple pseudo-code:: - - while monitoring_on: - for page in monitoring_target: - if accessed(page): - nr_accesses[page] += 1 - if time() % aggregation_interval == 0: - for callback in user_registered_callbacks: - callback(monitoring_target, nr_accesses) - for page in monitoring_target: - nr_accesses[page] = 0 - sleep(sampling interval) - -The monitoring overhead of this mechanism will arbitrarily increase as the -size of the target workload grows. - - -Region Based Sampling ---------------------- - -To avoid the unbounded increase of the overhead, DAMON groups adjacent pages -that are assumed to have the same access frequencies into a region. As long as the -assumption (pages in a region have the same access frequencies) is kept, only -one page in the region is required to be checked. Thus, for each ``sampling -interval``, DAMON randomly picks one page in each region, waits for one -``sampling interval``, checks whether the page is accessed meanwhile, and -increases the access frequency of the region if so. Therefore, the monitoring -overhead is controllable by setting the number of regions. DAMON allows users -to set the minimum and the maximum number of regions for the trade-off. - -This scheme, however, cannot preserve the quality of the output if the -assumption is not guaranteed.
- - -Adaptive Regions Adjustment ---------------------------- - -Even if somehow the initial monitoring target regions are well constructed to -fulfill the assumption (pages in the same region have similar access frequencies), -the data access pattern can be dynamically changed. This will result in low -monitoring quality. To keep the assumption as much as possible, DAMON -adaptively merges and splits each region based on their access frequency. - -For each ``aggregation interval``, it compares the access frequencies of -adjacent regions and merges those if the frequency difference is small. Then, -after it reports and clears the aggregated access frequency of each region, it -splits each region into two or three regions if the total number of regions -will not exceed the user-specified maximum number of regions after the split. - -In this way, DAMON provides its best-effort quality and minimal overhead while -keeping the bounds users set for their trade-off. - - -Dynamic Target Space Updates Handling -------------------------------------- - -The monitoring target address range could be dynamically changed. For example, -virtual memory could be dynamically mapped and unmapped. Physical memory could -be hot-plugged. - -As the changes could be quite frequent in some cases, DAMON allows the -monitoring operations to check dynamic changes including memory mapping changes -and applies it to monitoring operations-related data structures such as the -abstracted monitoring target memory area only for each of a user-specified time -interval (``update interval``). diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst deleted file mode 100644 index dde7e2414ee6..000000000000 --- a/Documentation/vm/damon/faq.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================== -Frequently Asked Questions -========================== - -Why a new subsystem, instead of extending perf or other user space tools?
-========================================================================= - -First, because it needs to be lightweight as much as possible so that it can be -used online, any unnecessary overhead such as kernel - user space context -switching cost should be avoided. Second, DAMON aims to be used by other -programs including the kernel. Therefore, having a dependency on specific -tools like perf is not desirable. These are the two biggest reasons why DAMON -is implemented in the kernel space. - - -Can 'idle pages tracking' or 'perf mem' substitute DAMON? -========================================================= - -Idle page tracking is a low level primitive for access check of the physical -address space. 'perf mem' is similar, though it can use sampling to minimize -the overhead. On the other hand, DAMON is a higher-level framework for the -monitoring of various address spaces. It is focused on memory management -optimization and provides sophisticated accuracy/overhead handling mechanisms. -Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of -DAMON's output, but cannot substitute DAMON. - - -Does DAMON support virtual memory only? -======================================= - -No. The core of the DAMON is address space independent. The address space -specific monitoring operations including monitoring target regions -constructions and actual access checks can be implemented and configured on the -DAMON core by the users. In this way, DAMON users can monitor any address -space with any access check technique. - -Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based -implementations of the address space dependent functions for the virtual memory -and the physical memory by default, for a reference and convenient use. - - -Can I simply monitor page granularity? -====================================== - -Yes. 
You can do so by setting the ``min_nr_regions`` attribute higher than the -working set size divided by the page size. Because the monitoring target -regions size is forced to be ``>=page size``, the region split will have no -effect. diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst deleted file mode 100644 index 48c0bbff98b2..000000000000 --- a/Documentation/vm/damon/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================== -DAMON: Data Access MONitor -========================== - -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - - - *accurate* (the monitoring output is useful enough for DRAM level memory - management; it might not be appropriate for CPU Cache levels, though), - - *light-weight* (the monitoring overhead is low enough to be applied online), - and - - *scalable* (the upper-bound of the overhead is in constant range regardless - of the size of target workloads). - -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incur high data access monitoring overhead could be implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. - -.. toctree:: - :maxdepth: 2 - - faq - design - api diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst deleted file mode 100644 index 8c05e62d8b2b..000000000000 --- a/Documentation/vm/free_page_reporting.rst +++ /dev/null @@ -1,40 +0,0 @@ -..
_free_page_reporting: - -===================== -Free Page Reporting -===================== - -Free page reporting is an API by which a device can register to receive -lists of pages that are currently unused by the system. This is useful in -the case of virtualization where a guest is then able to use this data to -notify the hypervisor that it is no longer using certain pages in memory. - -For the driver, typically a balloon driver, to make use of this functionality, -it will allocate and initialize a page_reporting_dev_info structure. The -field within the structure it will populate is the "report" function -pointer used to process the scatterlist. It must also guarantee that it can -handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per -call to the function. A call to page_reporting_register will register the -page reporting interface with the reporting framework assuming no other -page reporting devices are already registered. - -Once registered the page reporting API will begin reporting batches of -pages to the driver. The API will start reporting pages 2 seconds after -the interface is registered and will continue to do so 2 seconds after any -page of a sufficiently high order is freed. - -Pages reported will be stored in the scatterlist passed to the reporting -function with the final entry having the end bit set in entry nent - 1. -While pages are being processed by the report function they will not be -accessible to the allocator. Once the report function has been completed -the pages will be returned to the free area from which they were obtained. - -Prior to removing a driver that is making use of free page reporting it -is necessary to call page_reporting_unregister to have the -page_reporting_dev_info structure that is currently in use by free page -reporting removed. Doing this will prevent further reports from being -issued via the interface.
If another driver or the same driver is -registered it is possible for it to resume where the previous driver had -left off in terms of reporting free pages. - -Alexander Duyck, Dec 04, 2019 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst deleted file mode 100644 index feecc5e24477..000000000000 --- a/Documentation/vm/frontswap.rst +++ /dev/null @@ -1,266 +0,0 @@ -.. _frontswap: - -========= -Frontswap -========= - -Frontswap provides a "transcendent memory" interface for swap pages. -In some environments, dramatic performance savings may be obtained because -swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. 
An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. -Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. 
-This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device". - -Frontswap with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). 
-Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. 
(Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. 
In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. - -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. 
- -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. 
- -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? - -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the real swap device, -this is the correct course of action to ensure coherency. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst deleted file mode 100644 index c9887f241c6c..000000000000 --- a/Documentation/vm/highmem.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. _highmem: - -==================== -High Memory Handling -==================== - -By: Peter Zijlstra - -.. contents:: :local: - -What Is High Memory? -==================== - -High memory (highmem) is used when the size of physical memory approaches or -exceeds the maximum size of virtual memory. At that point it becomes -impossible for the kernel to keep all of the available physical memory mapped -at all times. This means the kernel needs to start using temporary mappings of -the pieces of physical memory that it wants to access.
- -The part of (physical) memory not covered by a permanent mapping is what we -refer to as 'highmem'. There are various architecture dependent constraints on -where exactly that border lies. - -In the i386 arch, for example, we choose to map the kernel into every process's -VM space so that we don't have to pay the full TLB invalidation costs for -kernel entry/exit. This means the available virtual memory space (4GiB on -i386) has to be divided between user and kernel space. - -The traditional split for architectures using this approach is 3:1, 3GiB for -userspace and the top 1GiB for kernel space:: - - +--------+ 0xffffffff - | Kernel | - +--------+ 0xc0000000 - | | - | User | - | | - +--------+ 0x00000000 - -This means that the kernel can at most map 1GiB of physical memory at any one -time, but because we need virtual address space for other things - including -temporary maps to access the rest of the physical memory - the actual direct -map will typically be less (usually around ~896MiB). - -Other architectures that have mm context tagged TLBs can have separate kernel -and user maps. Some hardware (like some ARMs), however, have limited virtual -space when they use mm context tags. - - -Temporary Virtual Mappings -========================== - -The kernel contains several ways of creating temporary mappings. The following -list shows them in order of preference of use. - -* kmap_local_page(). This function is used to require short term mappings. - It can be invoked from any context (including interrupts) but the mappings - can only be used in the context which acquired them. - - This function should be preferred, where feasible, over all the others. - - These mappings are thread-local and CPU-local, meaning that the mapping - can only be accessed from within this thread and the thread is bound to the - CPU while the mapping is active.
Even if the thread is preempted (since - preemption is never disabled by the function) the CPU can not be - unplugged from the system via CPU-hotplug until the mapping is disposed. - - It's valid to take pagefaults in a local kmap region, unless the context - in which the local mapping is acquired does not allow it for other reasons. - - kmap_local_page() always returns a valid virtual address and it is assumed - that kunmap_local() will never fail. - - Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain - extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered - because the map implementation is stack based. See kmap_local_page() kdocs - (included in the "Functions" section) for details on how to manage nested - mappings. - -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. - - kmap_atomic() may also be used by interrupt contexts, since it does not - sleep and the callers too may not sleep until after kunmap_atomic() is - called. - - Each call of kmap_atomic() in the kernel creates a non-preemptible section - and disable pagefaults. This could be a source of unwanted latency. Therefore - users should prefer kmap_local_page() instead of kmap_atomic(). - - It is assumed that k[un]map_atomic() won't fail. - -* kmap(). This should be used to make short duration mapping of a single - page with no restrictions on preemption or migration. It comes with an - overhead as mapping space is restricted and protected by a global lock - for synchronization. When mapping is no longer needed, the address that - the page was mapped to must be released with kunmap(). - - Mapping changes must be propagated across all the CPUs. 
kmap() also - requires global TLB invalidation when the kmap's pool wraps and it might - block when the mapping space is fully utilized until a slot becomes - available. Therefore, kmap() is only callable from preemptible context. - - All the above work is necessary if a mapping must last for a relatively - long time but the bulk of high-memory mappings in the kernel are - short-lived and only used in one place. This means that the cost of - kmap() is mostly wasted in such cases. kmap() was not intended for long - term mappings but it has morphed in that direction and its use is - strongly discouraged in newer code and the set of the preceding functions - should be preferred. - - On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have - no real work to do because a 64-bit address space is more than sufficient to - address all the physical memory whose pages are permanently mapped. - -* vmap(). This can be used to make a long duration mapping of multiple - physical pages into a contiguous virtual space. It needs global - synchronization to unmap. - - -Cost of Temporary Mappings -========================== - -The cost of creating temporary mappings can be quite high. The arch has to -manipulate the kernel's page tables, the data TLB and/or the MMU's registers. - -If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping -simply with a bit of arithmetic that will convert the page struct address into -a pointer to the page contents rather than juggling mappings about. In such a -case, the unmap operation may be a null operation. - -If CONFIG_MMU is not set, then there can be no temporary mappings and no -highmem. In such a case, the arithmetic approach will also be used. - - -i386 PAE -======== - -The i386 arch, under some circumstances, will permit you to stick up to 64GiB -of RAM into your 32-bit machine. 
This has a number of consequences: - -* Linux needs a page-frame structure for each page in the system and the - pageframes need to live in the permanent mapping, which means: - -* you can have 896M/sizeof(struct page) page-frames at most; with struct - page being 32-bytes that would end up being something in the order of 112G - worth of pages; the kernel, however, needs to store more than just - page-frames in that memory... - -* PAE makes your page tables larger - which slows the system down as more - data has to be accessed to traverse in TLB fills and the like. One - advantage is that PAE has more PTE bits and can provide advanced features - like NX and PAT. - -The general recommendation is that you don't use more than 8GiB on a 32-bit -machine - although more might work for you and your workload, you're pretty -much on your own - don't expect kernel developers to really care much if things -come apart. - - -Functions -========= - -.. kernel-doc:: include/linux/highmem.h -.. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst deleted file mode 100644 index f2a59ed82ed3..000000000000 --- a/Documentation/vm/hmm.rst +++ /dev/null @@ -1,452 +0,0 @@ -.. _hmm: - -===================================== -Heterogeneous Memory Management (HMM) -===================================== - -Provide infrastructure and helpers to integrate non-conventional memory (device -memory like GPU on board memory) into regular kernel path, with the cornerstone -of this being specialized struct page for such memory (see sections 5 to 7 of -this document). - -HMM also provides optional helpers for SVM (Shared Virtual Memory), i.e., -allowing a device to transparently access program addresses coherently with -the CPU meaning that any valid pointer on the CPU is also a valid pointer -for the device.
This is becoming mandatory to simplify the use of advanced -heterogeneous computing where GPU, DSP, or FPGA are used to perform various -computations on behalf of a process. - -This document is divided as follows: in the first section I expose the problems -related to using device specific memory allocators. In the second section, I -expose the hardware limitations that are inherent to many platforms. The third -section gives an overview of the HMM design. The fourth section explains how -CPU page-table mirroring works and the purpose of HMM in this context. The -fifth section deals with how device memory is represented inside the kernel. -Finally, the last section presents a new migration helper that allows -leveraging the device DMA engine. - -.. contents:: :local: - -Problems of using a device specific memory allocator -==================================================== - -Devices with a large amount of on board memory (several gigabytes) like GPUs -have historically managed their memory through dedicated driver specific APIs. -This creates a disconnect between memory allocated and managed by a device -driver and regular application memory (private anonymous, shared memory, or -regular file backed memory). From here on I will refer to this aspect as split -address space. I use shared address space to refer to the opposite situation: -i.e., one in which any application memory region can be used by a device -transparently. - -Split address space happens because devices can only access memory allocated -through a device specific API. This implies that all memory objects in a program -are not equal from the device point of view which complicates large programs -that rely on a wide set of libraries. 
- -Concretely, this means that code that wants to leverage devices like GPUs needs -to copy objects between generically allocated memory (malloc, mmap private, mmap -share) and memory allocated through the device driver API (this still ends up -with an mmap but of the device file). - -For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -for complex data sets (list, tree, ...) it's hard to get right. Duplicating a -complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and programs get harder to debug because of the -duplicate data set and addresses. - -Split address space also means that libraries cannot transparently use data -they are getting from the core program or another library and thus each library -might have to duplicate its input data set using the device specific memory -allocator. Large projects suffer from this and waste resources because of the -various memory copies. - -Duplicating each library API to accept as input or output memory allocated by -each device specific allocator is not a viable option. It would lead to a -combinatorial explosion in the library entry points. - -Finally, with the advance of high level language constructs (in C++ but in -other languages too) it is now possible for the compiler to leverage GPUs and -other devices without programmer knowledge. Some compiler identified patterns -are only do-able with a shared address space. It is also more reasonable to use -a shared address space for all other patterns. - - -I/O bus, device memory characteristics -====================================== - -I/O buses cripple shared address spaces due to a few limitations. Most I/O -buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from a CPU is even more -limited. More often than not, it is not cache coherent. 
- -If we only consider the PCIE bus, then a device can access main memory (often -through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from the device on main memory. This is worse -in the other direction: the CPU can only access a limited range of the device -memory and cannot perform atomic operations on it. Thus device memory cannot -be considered the same as regular memory from the kernel point of view. - -Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0 -and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s). -The final limitation is latency. Access to main memory from the device has an -order of magnitude higher latency than when the device accesses its own memory. - -Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow -two-way cache coherency between CPU and device and allow all atomic operations the -architecture supports. Sadly, not all platforms are following this trend and -some major architectures are left without hardware solutions to these problems. - -So for shared address space to make sense, not only must we allow devices to -access any memory but we must also permit any memory to be migrated to device -memory while the device is using it (blocking CPU access while it happens). - - -Shared address space and migration -================================== - -HMM intends to provide two main features. The first one is to share the address -space by duplicating the CPU page table in the device page table so the same -address points to the same physical memory for any valid main memory address in -the process address space. - -To achieve this, HMM offers a set of helpers to populate the device page table -while keeping track of CPU page table updates. Device page table updates are -not as easy as CPU page table updates. 
To update the device page table, you must -allocate a buffer (or use a pool of pre-allocated buffers) and write GPU -specific commands in it to perform the update (unmap, cache invalidations, and -flush, ...). This cannot be done through common code for all devices. Hence -why HMM provides helpers to factor out everything that can be while leaving the -hardware specific details to the device driver. - -The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of device memory. Those pages -are special because the CPU cannot map them. However, they allow migrating -main memory to device memory using existing migration mechanisms and everything -looks like a page that is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm -mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE -memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate is left to the device driver. - -Note that any CPU access to a device page triggers a page fault and a migration -back to main memory. For example, when a page backing a given CPU address A is -migrated from a main memory page to a device page, then any CPU access to -address A triggers a page fault and initiates a migration back to main memory. - -With these two features, HMM not only allows a device to mirror process address -space and keeps both CPU and device page tables synchronized, but also -leverages device memory by migrating the part of the data set that is actively being -used by the device. - - -Address space mirroring implementation and API -============================================== - -Address space mirroring's main objective is to allow duplication of a range of -CPU page table into a device page table; HMM helps keep both synchronized. 
A -device driver that wants to mirror a process address space must start with the -registration of a mmu_interval_notifier:: - - int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, - struct mm_struct *mm, unsigned long start, - unsigned long length, - const struct mmu_interval_notifier_ops *ops); - -During the ops->invalidate() callback the device driver must perform the -update action to the range (mark range read only, or fully unmap, etc.). The -device must complete the update before the driver callback returns. - -When the device driver wants to populate a range of virtual addresses, it can -use:: - - int hmm_range_fault(struct hmm_range *range); - -It will trigger a page fault on missing or read-only entries if write access is -requested (see below). Page faults use the generic mm page fault code path just -like a CPU page fault. - -Both functions copy CPU page table entries into their pfns array argument. Each -entry in that array corresponds to an address in the virtual range. HMM -provides a set of flags to help the driver identify special CPU page table -entries. - -Locking within the ops->invalidate() callback is the most important -aspect the driver must respect in order to keep things properly synchronized. -The usage pattern is:: - - int driver_populate_range(...) - { - struct hmm_range range; - ...
- - range.notifier = &interval_sub; - range.start = ...; - range.end = ...; - range.hmm_pfns = ...; - - if (!mmget_not_zero(interval_sub->notifier.mm)) - return -EFAULT; - - again: - range.notifier_seq = mmu_interval_read_begin(&interval_sub); - mmap_read_lock(mm); - ret = hmm_range_fault(&range); - if (ret) { - mmap_read_unlock(mm); - if (ret == -EBUSY) - goto again; - return ret; - } - mmap_read_unlock(mm); - - take_lock(driver->update); - if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) { - release_lock(driver->update); - goto again; - } - - /* Use pfns array content to update device page table, - * under the update lock */ - - release_lock(driver->update); - return 0; - } - -The driver->update lock is the same lock that the driver takes inside its -invalidate() callback. That lock must be held before calling -mmu_interval_read_retry() to avoid any race with a concurrent CPU page table -update. - -Leverage default_flags and pfn_flags_mask -========================================= - -The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify -fault or snapshot policy for the whole range instead of having to set them -for each entry in the pfns array. - -For instance if the device driver wants pages for a range with at least read -permission, it sets:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = 0; - -and calls hmm_range_fault() as described above. This will fault in all pages -in the range with at least read permission. - -Now let's say the driver wants to do the same except for one page in the range for -which it wants to have write permission.
Now the driver sets:: - - range->default_flags = HMM_PFN_REQ_FAULT; - range->pfn_flags_mask = HMM_PFN_REQ_WRITE; - range->hmm_pfns[index_of_write] = HMM_PFN_REQ_WRITE; - -With this, HMM will fault in all pages with at least read permission (i.e., valid) and for the -address == range->start + (index_of_write << PAGE_SHIFT) it will fault with -write permission, i.e., if the CPU pte does not have write permission set then HMM -will call handle_mm_fault(). - -After hmm_range_fault completes the flag bits are set to the current state of -the page tables, i.e., HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is -writable. - - -Represent and manage device memory from core kernel point of view -================================================================= - -Several different designs were tried to support device memory. The first one -used a device specific data structure to keep information about migrated memory -and HMM hooked itself in various places of mm code to handle any access to -addresses that were backed by device memory. It turns out that this ended up -replicating most of the fields of struct page and also needed many kernel code -paths to be updated to understand this new kind of memory. - -Most kernel code paths never try to access the memory behind a page -but only care about struct page contents. Because of this, HMM switched to -directly using struct page for device memory which left most kernel code paths -unaware of the difference. We only need to make sure that no one ever tries to -map those pages from the CPU side. - -Migration to and from device memory -=================================== - -Because the CPU cannot access device memory directly, the device driver must -use hardware DMA or device specific load/store instructions to migrate data. -The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() -functions are designed to make drivers easier to write and to centralize common -code across drivers.
- -Before migrating pages to device private memory, special device private -``struct page`` need to be created. These will be used as special "swap" -page table entries so that a CPU process will fault if it tries to access -a page that has been migrated to device private memory. - -These can be allocated and freed with:: - - struct resource *res; - struct dev_pagemap pagemap; - - res = request_free_mem_region(&iomem_resource, /* number of bytes */, - "name of driver resource"); - pagemap.type = MEMORY_DEVICE_PRIVATE; - pagemap.range.start = res->start; - pagemap.range.end = res->end; - pagemap.nr_range = 1; - pagemap.ops = &device_devmem_ops; - memremap_pages(&pagemap, numa_node_id()); - - memunmap_pages(&pagemap); - release_mem_region(pagemap.range.start, range_len(&pagemap.range)); - -There are also devm_request_free_mem_region(), devm_memremap_pages(), -devm_memunmap_pages(), and devm_release_mem_region() when the resources can -be tied to a ``struct device``. - -The overall migration steps are similar to migrating NUMA pages within system -memory (see :ref:`Page migration <page_migration>`) but the steps are split -between device driver specific code and shared common code: - -1. ``mmap_read_lock()`` - - The device driver has to pass a ``struct vm_area_struct`` to - migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to - be held for the duration of the migration. - -2. ``migrate_vma_setup(struct migrate_vma *args)`` - - The device driver initializes the ``struct migrate_vma`` fields and passes - the pointer to migrate_vma_setup(). The ``args->flags`` field is used to - filter which source pages should be migrated. For example, setting - ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and - ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in - device private memory. If the latter flag is set, the ``args->pgmap_owner`` - field is used to identify device private pages owned by the driver.
This - avoids trying to migrate device private pages residing in other devices. - Currently only anonymous private VMA ranges can be migrated to or from - system memory and device private memory. - - One of the first steps migrate_vma_setup() does is to invalidate other - devices' MMUs with the ``mmu_notifier_invalidate_range_start()`` and - ``mmu_notifier_invalidate_range_end()`` calls around the page table - walks to fill in the ``args->src`` array with PFNs to be migrated. - The ``invalidate_range_start()`` callback is passed a - ``struct mmu_notifier_range`` with the ``event`` field set to - ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to - the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This - allows the device driver to skip the invalidation callback and only - invalidate device private MMU mappings that are actually migrating. - This is explained more in the next section. - - While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()`` - entry results in a valid "zero" PFN stored in the ``args->src`` array. - This lets the driver allocate device private memory and clear it instead - of copying a page of zeros. Valid PTE entries to system memory or - device private struct pages will be locked with ``lock_page()``, isolated - from the LRU (if system memory since device private pages are not on - the LRU), unmapped from the process, and a special migration PTE is - inserted in place of the original PTE. - migrate_vma_setup() also clears the ``args->dst`` array. - -3. The device driver allocates destination pages and copies source pages to - destination pages. - - The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE`` - bit is set and skips entries that are not migrating. The device driver - can also choose to skip migrating a page by not filling in the ``dst`` - array for that page.
- - The driver then allocates either a device private struct page or a - system memory page, locks the page with ``lock_page()``, and fills in the - ``dst`` array entry with:: - - dst[i] = migrate_pfn(page_to_pfn(dpage)); - - Now that the driver knows that this page is being migrated, it can - invalidate device private MMU mappings and copy device private memory - to system memory or another device private page. The core Linux kernel - handles CPU page table invalidations so the device driver only has to - invalidate its own MMU mappings. - - The driver can use ``migrate_pfn_to_page(src[i])`` to get the - ``struct page`` of the source and either copy the source page to the - destination or clear the destination device private memory if the pointer - is ``NULL`` meaning the source page was not populated in system memory. - -4. ``migrate_vma_pages()`` - - This step is where the migration is actually "committed". - - If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this - is where the newly allocated page is inserted into the CPU's page table. - This can fail if a CPU thread faults on the same page. However, the page - table is locked and only one of the new pages will be inserted. - The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared - if it loses the race. - - If the source page was locked, isolated, etc. the source ``struct page`` - information is now copied to destination ``struct page`` finalizing the - migration on the CPU side. - -5. Device driver updates device MMU page tables for pages still migrating, - rolling back pages not migrating. - - If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device - driver can update the device MMU and set the write enable bit if the - ``MIGRATE_PFN_WRITE`` bit is set. - -6. 
``migrate_vma_finalize()`` - - This step replaces the special migration page table entry with the new - page's page table entry and releases the reference to the source and - destination ``struct page``. - -7. ``mmap_read_unlock()`` - - The lock can now be released. - -Exclusive access memory -======================= - -Some devices have features such as atomic PTE bits that can be used to implement -atomic access to system memory. To support atomic operations to a shared virtual -memory page such a device needs access to that page which is exclusive of any -userspace access from the CPU. The ``make_device_exclusive_range()`` function -can be used to make a memory range inaccessible from userspace. - -This replaces all mappings for pages in the given range with special swap -entries. Any attempt to access the swap entry results in a fault which is -resolved by replacing the entry with the original mapping. A driver gets -notified that the mapping has been changed by MMU notifiers, after which point -it will no longer have exclusive access to the page. Exclusive access is -guaranteed to last until the driver drops the page lock and page reference, at -which point any CPU faults on the page may proceed as described. - -Memory cgroup (memcg) and rss accounting -======================================== - -For now, device memory is accounted as any regular page in rss counters (either -anonymous if device page is used for anonymous, file if device page is used for -file backed page, or shmem if device page is used for shared memory). This is a -deliberate choice to keep existing applications, that might start using device -memory without knowing about it, running unimpacted. - -A drawback is that the OOM killer might kill an application using a lot of -device memory and not a lot of regular system memory and thus not freeing much -system memory.
We want to gather more real world experience on how applications -and system react under memory pressure in the presence of device memory before -deciding to account device memory differently. - - -The same decision was made for memory cgroup. Device memory pages are accounted -against the same memory cgroup a regular page would be accounted to. This does -simplify migration to and from device memory. This also means that migration -back from device memory to regular memory cannot fail because it would -go above memory cgroup limit. We might revisit this choice later on once we -get more experience in how device memory is used and its impact on memory -resource control. - - -Note that device memory can never be pinned by a device driver nor through GUP -and thus such memory is always free upon process exit. Or when last reference -is dropped in case of shared memory or file backed memory. diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst deleted file mode 100644 index f143954e0d05..000000000000 --- a/Documentation/vm/hugetlbfs_reserv.rst +++ /dev/null @@ -1,596 +0,0 @@ -.. _hugetlbfs_reserve: - -===================== -Hugetlbfs Reservation -===================== - -Overview -======== - -Huge pages as described at :ref:`hugetlbpage` are typically -preallocated for application use. These huge pages are instantiated in a -task's address space at page fault time if the VMA indicates huge pages are -to be used. If no huge page exists at page fault time, the task is sent -a SIGBUS and often dies an unhappy death. Shortly after huge page support -was added, it was determined that it would be better to detect a shortage -of huge pages at mmap() time. The idea is that if there were not enough -huge pages to cover the mapping, the mmap() would fail. This was first -done with a simple check in the code at mmap() time to determine if there -were enough free huge pages to cover the mapping.
Like most things in the -kernel, the code has evolved over time. However, the basic idea was to -'reserve' huge pages at mmap() time to ensure that huge pages would be -available for page faults in that mapping. The description below attempts to -describe how huge page reserve processing is done in the v4.10 kernel. - - -Audience -======== -This description is primarily targeted at kernel developers who are modifying -hugetlbfs code. - - -The Data Structures -=================== - -resv_huge_pages - This is a global (per-hstate) count of reserved huge pages. Reserved - huge pages are only available to the task which reserved them. - Therefore, the number of huge pages generally available is computed - as (``free_huge_pages - resv_huge_pages``). -Reserve Map - A reserve map is described by the structure:: - - struct resv_map { - struct kref refs; - spinlock_t lock; - struct list_head regions; - long adds_in_progress; - struct list_head region_cache; - long region_cache_count; - }; - - There is one reserve map for each huge page mapping in the system. - The regions list within the resv_map describes the regions within - the mapping. A region is described as:: - - struct file_region { - struct list_head link; - long from; - long to; - }; - - The 'from' and 'to' fields of the file region structure are huge page - indices into the mapping. Depending on the type of mapping, a - region in the reserv_map may indicate reservations exist for the - range, or reservations do not exist. -Flags for MAP_PRIVATE Reservations - These are stored in the bottom bits of the reservation map pointer. - - ``#define HPAGE_RESV_OWNER (1UL << 0)`` - Indicates this task is the owner of the reservations - associated with the mapping. - ``#define HPAGE_RESV_UNMAPPED (1UL << 1)`` - Indicates task originally mapping this range (and creating - reserves) has unmapped a page from this task (the child) - due to a failed COW. 
-Page Flags - The PagePrivate page flag is used to indicate that a huge page - reservation must be restored when the huge page is freed. More - details will be discussed in the "Freeing huge pages" section. - - -Reservation Map Location (Private or Shared) -============================================ - -A huge page mapping or segment is either private or shared. If private, -it is typically only available to a single address space (task). If shared, -it can be mapped into multiple address spaces (tasks). The location and -semantics of the reservation map is significantly different for the two types -of mappings. Location differences are: - -- For private mappings, the reservation map hangs off the VMA structure. - Specifically, vma->vm_private_data. This reserve map is created at the - time the mapping (mmap(MAP_PRIVATE)) is created. -- For shared mappings, the reservation map hangs off the inode. Specifically, - inode->i_mapping->private_data. Since shared mappings are always backed - by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode - contains a reservation map. As a result, the reservation map is allocated - when the inode is created. - - -Creating Reservations -===================== -Reservations are created when a huge page backed shared memory segment is -created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). -These operations result in a call to the routine hugetlb_reserve_pages():: - - int hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) - -The first thing hugetlb_reserve_pages() does is check if the NORESERVE -flag was specified in either the shmget() or mmap() call. If NORESERVE -was specified, then this routine returns immediately as no reservations -are desired. - -The arguments 'from' and 'to' are huge page indices into the mapping or -underlying file. 
For shmget(), 'from' is always 0 and 'to' corresponds to -the length of the segment/mapping. For mmap(), the offset argument could -be used to specify the offset into the underlying file. In such a case, -the 'from' and 'to' arguments have been adjusted by this offset. - -One of the big differences between PRIVATE and SHARED mappings is the way -in which reservations are represented in the reservation map. - -- For shared mappings, an entry in the reservation map indicates a reservation - exists or did exist for the corresponding page. As reservations are - consumed, the reservation map is not modified. -- For private mappings, the lack of an entry in the reservation map indicates - a reservation exists for the corresponding page. As reservations are - consumed, entries are added to the reservation map. Therefore, the - reservation map can also be used to determine which reservations have - been consumed. - -For private mappings, hugetlb_reserve_pages() creates the reservation map and -hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set -to indicate this VMA owns the reservations. - -The reservation map is consulted to determine how many huge page reservations -are needed for the current mapping/segment. For private mappings, this is -always the value (to - from). However, for shared mappings it is possible that -some reservations may already exist within the range (to - from). See the -section :ref:`Reservation Map Modifications <resv_map_modifications>` -for details on how this is accomplished. - -The mapping may be associated with a subpool. If so, the subpool is consulted -to ensure there is sufficient space for the mapping. It is possible that the -subpool has set aside reservations that can be used for the mapping. See the -section :ref:`Subpool Reservations <sub_pool_resv>` for more details. - -After consulting the reservation map and subpool, the number of needed new -reservations is known.
The routine hugetlb_acct_memory() is called to check -for and take the requested number of reservations. hugetlb_acct_memory() -calls into routines that potentially allocate and adjust surplus page counts. -However, within those routines the code is simply checking to ensure there -are enough free huge pages to accommodate the reservation. If there are, -the global reservation count resv_huge_pages is adjusted something like the -following:: - - if (resv_needed <= (free_huge_pages - resv_huge_pages)) - resv_huge_pages += resv_needed; - -Note that the global lock hugetlb_lock is held when checking and adjusting -these counters. - -If there were enough free huge pages and the global count resv_huge_pages -was adjusted, then the reservation map associated with the mapping is -modified to reflect the reservations. In the case of a shared mapping, a -file_region will exist that includes the range 'from' - 'to'. For private -mappings, no modifications are made to the reservation map as lack of an -entry indicates a reservation exists. - -If hugetlb_reserve_pages() was successful, the global reservation count and -reservation map associated with the mapping will be modified as required to -ensure reservations exist for the range 'from' - 'to'. - -.. _consume_resv: - -Consuming Reservations/Allocating a Huge Page -============================================= - -Reservations are consumed when huge pages associated with the reservations -are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: - - struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) - -alloc_huge_page is passed a VMA pointer and a virtual address, so it can -consult the reservation map to determine if a reservation exists.
In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves -should not be used even if it appears they have been set aside for the -specified address. The avoid_reserve argument is most often used in the case -of Copy on Write and Page Migration where additional copies of an existing -page are being allocated. - -The helper routine vma_needs_reservation() is called to determine if a -reservation exists for the address within the mapping (vma). See the section -:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed -information on what this routine does. -The value returned from vma_needs_reservation() is generally -0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. -If a reservation does not exist, and there is a subpool associated with the -mapping the subpool is consulted to determine if it contains reservations. -If the subpool contains reservations, one can be used for this allocation. -However, in every case the avoid_reserve argument overrides the use of -a reservation for the allocation. After determining whether a reservation -exists and can be used for the allocation, the routine dequeue_huge_page_vma() -is called. This routine takes two arguments related to reservations: - -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() -- chg, even though this argument is of type long only the values 0 or 1 are - passed to dequeue_huge_page_vma. If the value is 0, it indicates a - reservation exists (see the section "Memory Policy and Reservations" for - possible issues). If the value is 1, it indicates a reservation does not - exist and the page must be taken from the global free pool if possible. - -The free lists associated with the memory policy of the VMA are searched for -a free page. If a page is found, the value free_huge_pages is decremented -when the page is removed from the free list.
If there was a reservation -associated with the page, the following adjustments are made:: - - SetPagePrivate(page); /* Indicates allocating this page consumed - * a reservation, and if an error is - * encountered such that the page must be - * freed, the reservation will be restored. */ - resv_huge_pages--; /* Decrement the global reservation count */ - -Note, if no huge page can be found that satisfies the VMA's memory policy -an attempt will be made to allocate one using the buddy allocator. This -brings up the issue of surplus huge pages and overcommit which is beyond -the scope of reservations. Even if a surplus page is allocated, the same -reservation based adjustments as above will be made: SetPagePrivate(page) and -resv_huge_pages--. - -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. - -The routine vma_commit_reservation() is then called to adjust the reserve -map based on the consumption of the reservation. In general, this involves -ensuring the page is represented within a file_region structure of the region -map. For shared mappings where the reservation was present, an entry -in the reserve map already existed so no change is made. However, if there -was no reservation in a shared mapping or this was a private mapping a new -entry must be created. - -It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would -be possible if hugetlb_reserve_pages was called for the same page in a shared -mapping. In such cases, the reservation count and subpool free page count -will be off by one. This rare condition can be identified by comparing the -return value from vma_needs_reservation and vma_commit_reservation.
If such -a race is detected, the subpool and global reserve counts are adjusted to -compensate. See the section -:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more -information on these routines. - - -Instantiate Huge Pages -====================== - -After huge page allocation, the page is typically added to the page tables -of the allocating task. Before this, pages in a shared mapping are added -to the page cache and pages in private mappings are added to an anonymous -reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, -when a huge page that has been instantiated is freed no adjustment is made -to the global reservation count (resv_huge_pages). - - -Freeing Huge Pages -================== - -Huge page freeing is performed by the routine free_huge_page(). This routine -is the destructor for hugetlbfs compound pages. As a result, it is only -passed a pointer to the page struct. When a huge page is freed, reservation -accounting may need to be performed. This would be the case if the page was -associated with a subpool that contained reserves, or the page is being freed -on an error path where a global reserve count must be restored. - -The page->private field points to any subpool associated with the page. -If the PagePrivate flag is set, it indicates the global reserve count should -be adjusted (see the section -:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>` -for information on how these are set). - -The routine first calls hugepage_subpool_put_pages() for the page. If this -routine returns a value of 0 (which does not equal the value passed 1) it -indicates reserves are associated with the subpool, and this newly freed page -must be used to keep the number of subpool reserves above the minimum size. -Therefore, the global resv_huge_pages counter is incremented in this case. - -If the PagePrivate flag was set in the page, the global resv_huge_pages counter -will always be incremented. - -..
_sub_pool_resv: - -Subpool Reservations -==================== - -There is a struct hstate associated with each huge page size. The hstate -tracks all huge pages of the specified size. A subpool represents a subset -of pages within a hstate that is associated with a mounted hugetlbfs -filesystem. - -When a hugetlbfs filesystem is mounted a min_size option can be specified -which indicates the minimum number of huge pages required by the filesystem. -If this option is specified, the number of huge pages corresponding to -min_size are reserved for use by the filesystem. This number is tracked in -the min_hpages field of a struct hugepage_subpool. At mount time, -hugetlb_acct_memory(min_hpages) is called to reserve the specified number of -huge pages. If they can not be reserved, the mount fails. - -The routines hugepage_subpool_get/put_pages() are called when pages are -obtained from or released back to a subpool. They perform all subpool -accounting, and track any reservations associated with the subpool. -hugepage_subpool_get/put_pages are passed the number of huge pages by which -to adjust the subpool 'used page' count (down for get, up for put). Normally, -they return the same value that was passed or an error if not enough pages -exist in the subpool. - -However, if reserves are associated with the subpool a return value less -than the passed value may be returned. This return value indicates the -number of additional global pool adjustments which must be made. For example, -suppose a subpool contains 3 reserved huge pages and someone asks for 5. -The 3 reserved pages associated with the subpool can be used to satisfy part -of the request. But, 2 pages must be obtained from the global pools. To -relay this information to the caller, the value 2 is returned. The caller -is then responsible for attempting to obtain the additional two pages from -the global pools. 
- - -COW and Reservations -==================== - -Since shared mappings all point to and use the same underlying pages, the -biggest reservation concern for COW is private mappings. In this case, -two tasks can be pointing at the same previously allocated page. One task -attempts to write to the page, so a new page must be allocated so that each -task points to its own page. - -When the page was originally allocated, the reservation for that page was -consumed. When an attempt to allocate a new page is made as a result of -COW, it is possible that no free huge pages are free and the allocation -will fail. - -When the private mapping was originally created, the owner of the mapping -was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation -map of the owner. Since the owner created the mapping, the owner owns all -the reservations associated with the mapping. Therefore, when a write fault -occurs and there is no page available, different action is taken for the owner -and non-owner of the reservation. - -In the case where the faulting task is not the owner, the fault will fail and -the task will typically receive a SIGBUS. - -If the owner is the faulting task, we want it to succeed since it owned the -original reservation. To accomplish this, the page is unmapped from the -non-owning task. In this way, the only reference is from the owning task. -In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer -of the non-owning task. The non-owning task may receive a SIGBUS if it later -faults on a non-present page. But, the original owner of the -mapping/reservation will behave as expected. - - -.. _resv_map_modifications: - -Reservation Map Modifications -============================= - -The following low level routines are used to make modifications to a -reservation map. Typically, these routines are not called directly. Rather, -a reservation map helper routine is called which calls one of these low level -routines. 
These low level routines are fairly well documented in the source -code (mm/hugetlb.c). These routines are:: - - long region_chg(struct resv_map *resv, long f, long t); - long region_add(struct resv_map *resv, long f, long t); - void region_abort(struct resv_map *resv, long f, long t); - long region_count(struct resv_map *resv, long f, long t); - -Operations on the reservation map typically involve two steps: - -1) region_chg() is called to examine the reserve map and determine how - many pages in the specified range [f, t) are NOT currently represented. - - The calling code performs global checks and allocations to determine if - there are enough huge pages for the operation to succeed. - -2) - a) If the operation can succeed, region_add() is called to actually modify - the reservation map for the same range [f, t) previously passed to - region_chg(). - b) If the operation can not succeed, region_abort is called for the same - range [f, t) to abort the operation. - -Note that this is a two step process where region_add() and region_abort() -are guaranteed to succeed after a prior call to region_chg() for the same -range. region_chg() is responsible for pre-allocating any data structures -necessary to ensure the subsequent operations (specifically region_add()) -will succeed. - -As mentioned above, region_chg() determines the number of pages in the range -which are NOT currently represented in the map. This number is returned to -the caller. region_add() returns the number of pages in the range added to -the map. In most cases, the return value of region_add() is the same as the -return value of region_chg(). However, in the case of shared mappings it is -possible for changes to the reservation map to be made between the calls to -region_chg() and region_add(). In this case, the return value of region_add() -will not match the return value of region_chg().
It is likely that in such -cases global counts and subpool accounting will be incorrect and in need of -adjustment. It is the responsibility of the caller to check for this condition -and make the appropriate adjustments. - -The routine region_del() is called to remove regions from a reservation map. -It is typically called in the following situations: - -- When a file in the hugetlbfs filesystem is being removed, the inode will - be released and the reservation map freed. Before freeing the reservation - map, all the individual file_region structures must be freed. In this case - region_del is passed the range [0, LONG_MAX). -- When a hugetlbfs file is being truncated. In this case, all allocated pages - after the new file size must be freed. In addition, any file_region entries - in the reservation map past the new end of file must be deleted. In this - case, region_del is passed the range [new_end_of_file, LONG_MAX). -- When a hole is being punched in a hugetlbfs file. In this case, huge pages - are removed from the middle of the file one at a time. As the pages are - removed, region_del() is called to remove the corresponding entry from the - reservation map. In this case, region_del is passed the range - [page_idx, page_idx + 1). - -In every case, region_del() will return the number of pages removed from the -reservation map. In VERY rare cases, region_del() can fail. This can only -happen in the hole punch case where it has to split an existing file_region -entry and can not allocate a new structure. In this error case, region_del() -will return -ENOMEM. The problem here is that the reservation map will -indicate that there is a reservation for the page. However, the subpool and -global reservation counts will not reflect the reservation. To handle this -situation, the routine hugetlb_fix_reserve_counts() is called to adjust the -counters so that they correspond with the reservation map entry that could -not be deleted. 
- -region_count() is called when unmapping a private huge page mapping. In -private mappings, the lack of an entry in the reservation map indicates that -a reservation exists. Therefore, by counting the number of entries in the -reservation map we know how many reservations were consumed and how many are -outstanding (outstanding = (end - start) - region_count(resv, start, end)). -Since the mapping is going away, the subpool and global reservation counts -are decremented by the number of outstanding reservations. - -.. _resv_map_helpers: - -Reservation Map Helper Routines -=============================== - -Several helper routines exist to query and modify the reservation maps. -These routines are only interested in reservations for a specific huge -page, so they just pass in an address instead of a range. In addition, -they pass in the associated VMA. From the VMA, the type of mapping (private -or shared) and the location of the reservation map (inode or VMA) can be -determined. These routines simply call the underlying routines described -in the section "Reservation Map Modifications". However, they do take into -account the 'opposite' meaning of reservation map entries for private and -shared mappings and hide this detail from the caller:: - - long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This routine calls region_chg() for the specified page. If no reservation -exists, 1 is returned. If a reservation exists, 0 is returned:: - - long vma_commit_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_add() for the specified page. As in the case of region_chg -and region_add, this routine is to be called after a previous call to -vma_needs_reservation. It will add a reservation entry for the page. It -returns 1 if the reservation was added and 0 if not. The return value should -be compared with the return value of the previous call to -vma_needs_reservation.
An unexpected difference indicates the reservation -map was modified between calls:: - - void vma_end_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This calls region_abort() for the specified page. As in the case of region_chg -and region_abort, this routine is to be called after a previous call to -vma_needs_reservation. It will abort/end the in-progress reservation add -operation:: - - long vma_add_reservation(struct hstate *h, - struct vm_area_struct *vma, - unsigned long addr) - -This is a special wrapper routine to help facilitate reservation cleanup -on error paths. It is only called from the routine restore_reserve_on_error(). -This routine is used in conjunction with vma_needs_reservation in an attempt -to add a reservation to the reservation map. It takes into account the -different reservation map semantics for private and shared mappings. Hence, -region_add is called for shared mappings (as an entry present in the map -indicates a reservation), and region_del is called for private mappings (as -the absence of an entry in the map indicates a reservation). See the section -"Reservation Cleanup in Error Paths" for more information on what needs to -be done on error paths. - - -Reservation Cleanup in Error Paths -================================== - -As mentioned in the section -:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation -map modifications are performed in two steps. First vma_needs_reservation -is called before a page is allocated. If the allocation is successful, -then vma_commit_reservation is called. If not, vma_end_reservation is called. -Global and subpool reservation counts are adjusted based on success or failure -of the operation and all is well. - -Additionally, after a huge page is instantiated the PagePrivate flag is -cleared so that accounting when the page is ultimately freed is correct.
- -However, there are several instances where errors are encountered after a huge -page is allocated but before it is instantiated. In this case, the page -allocation has consumed the reservation and made the appropriate subpool, -reservation map and global count adjustments. If the page is freed at this -time (before instantiation and clearing of PagePrivate), then free_huge_page -will increment the global reservation count. However, the reservation map -indicates the reservation was consumed. This resulting inconsistent state -will cause the 'leak' of a reserved huge page. The global reserve count will -be higher than it should and prevent allocation of a pre-allocated page. - -The routine restore_reserve_on_error() attempts to handle this situation. It -is fairly well documented. The intention of this routine is to restore -the reservation map to the way it was before the page allocation. In this -way, the state of the reservation map will correspond to the global reservation -count after the page is freed. - -The routine restore_reserve_on_error itself may encounter errors while -attempting to restore the reservation map entry. In this case, it will -simply clear the PagePrivate flag of the page. In this way, the global -reserve count will not be incremented when the page is freed. However, the -reservation map will continue to look as though the reservation was consumed. -A page can still be allocated for the address, but it will not use a reserved -page as originally intended. - -There is some code (most notably userfaultfd) which can not call -restore_reserve_on_error. In this case, it simply modifies the PagePrivate -so that a reservation will not be leaked when the huge page is freed. - - -Reservations and Memory Policy -============================== -Per-node huge page lists existed in struct hstate when git was first used -to manage Linux code. The concept of reservations was added some time later. 
-When reservations were added, no attempt was made to take memory policy -into account. While cpusets are not exactly the same as memory policy, this -comment in hugetlb_acct_memory sums up the interaction between reservations -and cpusets/memory policy:: - - /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. - */ - -Huge page reservations were added to prevent unexpected page allocation -failures (OOM) at page fault time. However, if an application makes use -of cpusets or memory policy there is no guarantee that huge pages will be -available on the required nodes. This is true even if there are a sufficient -number of global reservations. - -Hugetlbfs regression testing -============================ - -The most complete set of hugetlb tests are in the libhugetlbfs repository. -If you modify any hugetlb related code, use the libhugetlbfs test suite -to check for regressions. In addition, if you add any new hugetlb -functionality, please add appropriate tests to libhugetlbfs. 
- --- -Mike Kravetz, 7 April 2017 diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst deleted file mode 100644 index b9d5253c1305..000000000000 --- a/Documentation/vm/hwpoison.rst +++ /dev/null @@ -1,184 +0,0 @@ -.. hwpoison: - -======== -hwpoison -======== - -What is hwpoison? -================= - -Upcoming Intel CPUs have support for recovering from some memory errors -(``MCA recovery``). This requires the OS to declare a page "poisoned", -kill the processes associated with it and avoid using it in the future. - -This patchkit implements the necessary infrastructure in the VM. - -To quote the overview comment:: - - High level machine check handler. Handles pages reported by the - hardware as being corrupted usually due to a 2bit ECC memory or cache - failure. - - This focusses on pages detected as corrupted in the background. - When the current CPU tries to consume corruption the currently - running process can just be killed directly instead. This implies - that if the error cannot be handled for some reason it's safe to - just ignore it because no corruption has been consumed yet. Instead - when that happens another machine check will happen. - - Handles page cache pages in various states. The tricky part - here is that we can access any page asynchronous to other VM - users, because memory failures could happen anytime and anywhere, - possibly violating some of their assumptions. This is why this code - has to be extremely careful. Generally it tries to use normal locking - rules, as in get the standard locks, even if that means the - error handling takes potentially a long time. - - Some of the operations here are somewhat inefficient and have non - linear algorithmic complexity, because the data structures have not - been optimized for this case. This is in particular the case - for the mapping from a vma to a process. Since this case is expected - to be rare we hope we can get away with this. 
- -The code consists of the high level handler in mm/memory-failure.c, -a new page poison bit and various checks in the VM to handle poisoned -pages. - -The main target right now is KVM guests, but it works for all kinds -of applications. KVM support requires a recent qemu-kvm release. - -For the KVM use there was need for a new signal type so that -KVM can inject the machine check into the guest with the proper -address. This in theory allows other applications to handle -memory failures too. The expectation is that nearly all applications -won't do that, but some very specialized ones might. - -Failure recovery modes -====================== - -There are two (actually three) modes memory failure recovery can be in: - -vm.memory_failure_recovery sysctl set to zero: - All memory failures cause a panic. Do not attempt recovery. - -early kill - (can be controlled globally and per process) - Send SIGBUS to the application as soon as the error is detected. - This allows applications that can process memory errors in a gentle - way (e.g. drop affected object) to do so. - This is the mode used by KVM qemu. - -late kill - Send SIGBUS when the application runs into the corrupted page. - This is best for memory error unaware applications and is the default. - Note some pages are always handled as late kill. - -User control -============ - -vm.memory_failure_recovery - See sysctl.txt - -vm.memory_failure_early_kill - Enable early kill mode globally - -PR_MCE_KILL - Set early/late kill mode/revert to system default - - arg1: PR_MCE_KILL_CLEAR: - Revert to system default - arg1: PR_MCE_KILL_SET: - arg2 defines thread specific mode - - PR_MCE_KILL_EARLY: - Early kill - PR_MCE_KILL_LATE: - Late kill - PR_MCE_KILL_DEFAULT - Use system global default - - Note that if you want to have a dedicated thread which handles - the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should - call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, - the SIGBUS is sent to the main thread.
- -PR_MCE_KILL_GET - return current mode - -Testing -======= - -* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the - process for testing - -* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/`` - - corrupt-pfn - Inject hwpoison fault at PFN echoed into this file. This does - some early filtering to avoid corrupted unintended pages in test suites. - - unpoison-pfn - Software-unpoison page at PFN echoed into this file. This way - a page can be reused again. This only works for Linux - injected failures, not for real memory failures. Once any hardware - memory failure happens, this feature is disabled. - - Note these injection interfaces are not stable and might change between - kernel versions - - corrupt-filter-dev-major, corrupt-filter-dev-minor - Only handle memory failures to pages associated with the file - system defined by block device major/minor. -1U is the - wildcard value. This should be only used for testing with - artificial injection. - - corrupt-filter-memcg - Limit injection to pages owned by memgroup. Specified by inode - number of the memcg. - - Example:: - - mkdir /sys/fs/cgroup/mem/hwpoison - - usemem -m 100 -s 1000 & - echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks - - memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ') - echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg - - page-types -p `pidof init` --hwpoison # shall do nothing - page-types -p `pidof usemem` --hwpoison # poison its pages - - corrupt-filter-flags-mask, corrupt-filter-flags-value - When specified, only poison pages if ((page_flags & mask) == - value). This allows stress testing of many kinds of - pages. The page_flags are the same as in /proc/kpageflags. The - flag bits are defined in include/linux/kernel-page-flags.h and - documented in Documentation/admin-guide/mm/pagemap.rst - -* Architecture specific MCE injector - - x86 has mce-inject, mce-test - - Some portable hwpoison test programs in mce-test, see below. 
- -References -========== - -http://halobates.de/mce-lc09-2.pdf - Overview presentation from LinuxCon 09 - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git - Test suite (hwpoison specific portable tests in tsrc) - -git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git - x86 specific injector - - -Limitations -=========== -- Not all page types are supported and never will. Most kernel internal - objects cannot be recovered, only LRU pages for now. - ---- -Andi Kleen, Oct 2009 diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst deleted file mode 100644 index 575ccd40e30c..000000000000 --- a/Documentation/vm/index.rst +++ /dev/null @@ -1,68 +0,0 @@ -===================================== -Linux Memory Management Documentation -===================================== - -Memory Management Guide -======================= - -This is a guide to understanding the memory management subsystem -of Linux. If you are looking for advice on simply allocating memory, -see the :ref:`memory_allocation`. For controlling and tuning guides, -see the :doc:`admin guide <../admin-guide/mm/index>`. - -.. toctree:: - :maxdepth: 1 - - physical_memory - page_tables - process_addrs - bootmem - page_allocation - vmalloc - slab - highmem - page_reclaim - swap - page_cache - shmfs - oom - -Legacy Documentation -==================== - -This is a collection of older documents about the Linux memory management -(MM) subsystem internals with different level of details ranging from -notes and mailing list responses for elaborating descriptions of data -structures and algorithms. It should all be integrated nicely into the -above structured documentation, or deleted if it has served its purpose. - -.. 
toctree:: - :maxdepth: 1 - - active_mm - arch_pgtable_helpers - balance - damon/index - free_page_reporting - frontswap - hmm - hwpoison - hugetlbfs_reserv - ksm - memory-model - mmu_notifier - numa - overcommit-accounting - page_migration - page_frags - page_owner - page_table_check - remap_file_pages - slub - split_page_table_lock - transhuge - unevictable-lru - vmalloced-kernel-stacks - vmemmap_dedup - z3fold - zsmalloc diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst deleted file mode 100644 index 9e37add068e6..000000000000 --- a/Documentation/vm/ksm.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. _ksm: - -======================= -Kernel Samepage Merging -======================= - -KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, -added to the Linux kernel in 2.6.32. See ``mm/ksm.c`` for its implementation, -and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/ - -The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst ` - -Design -====== - -Overview --------- - -.. kernel-doc:: mm/ksm.c - :DOC: Overview - -Reverse mapping ---------------- -KSM maintains reverse mapping information for KSM pages in the stable -tree. - -If a KSM page is shared between less than ``max_page_sharing`` VMAs, -the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the -KSM page points to the stable tree node. - -When the sharing passes this threshold, KSM adds a second dimension to -the stable tree. The tree node becomes a "chain" that links one or -more "dups". Each "dup" keeps reverse mapping information for a KSM -page with ``page->mapping`` pointing to that "dup". - -Every "chain" and all "dups" linked into a "chain" enforce the -invariant that they represent the same write protected memory content, -even if each "dup" will be pointed by a different KSM page copy of -that content. 
- -This way the stable tree lookup computational complexity is unaffected -if compared to an unlimited list of reverse mappings. It is still -enforced that there cannot be KSM page content duplicates in the -stable tree itself. - -The deduplication limit enforced by ``max_page_sharing`` is required -to avoid the virtual memory rmap lists to grow too large. The rmap -walk has O(N) complexity where N is the number of rmap_items -(i.e. virtual mappings) that are sharing the page, which is in turn -capped by ``max_page_sharing``. So this effectively spreads the linear -O(N) computational complexity from rmap walk context over different -KSM pages. The ksmd walk over the stable_node "chains" is also O(N), -but N is the number of stable_node "dups", not the number of -rmap_items, so it has not a significant impact on ksmd performance. In -practice the best stable_node "dup" candidate will be kept and found -at the head of the "dups" list. - -High values of ``max_page_sharing`` result in faster memory merging -(because there will be fewer stable_node dups queued into the -stable_node chain->hlist to check for pruning) and higher -deduplication factor at the expense of slower worst case for rmap -walks for any KSM page which can happen during swapping, compaction, -NUMA balancing and page migration. - -The ``stable_node_dups/stable_node_chains`` ratio is also affected by the -``max_page_sharing`` tunable, and an high ratio may indicate fragmentation -in the stable_node dups, which could be solved by introducing -fragmentation algorithms in ksmd which would refile rmap_items from -one stable_node dup to another stable_node dup, in order to free up -stable_node "dups" with few rmap_items in them, but that may increase -the ksmd CPU usage and possibly slowdown the readonly computations on -the KSM pages of the applications. - -The whole list of stable_node "dups" linked in the stable_node -"chains" is scanned periodically in order to prune stale stable_nodes. 
-The frequency of such scans is defined by -``stable_node_chains_prune_millisecs`` sysfs tunable. - -Reference ---------- -.. kernel-doc:: mm/ksm.c - :functions: mm_slot ksm_scan stable_node rmap_item - --- -Izik Eidus, -Hugh Dickins, 17 Nov 2009 diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst deleted file mode 100644 index 30e8fbed6914..000000000000 --- a/Documentation/vm/memory-model.rst +++ /dev/null @@ -1,177 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _physical_memory_model: - -===================== -Physical Memory Model -===================== - -Physical memory in a system may be addressed in different ways. The -simplest case is when the physical memory starts at address 0 and -spans a contiguous range up to the maximal address. It could be, -however, that this range contains small holes that are not accessible -for the CPU. Then there could be several contiguous ranges at -completely distinct addresses. And, don't forget about NUMA, where -different memory banks are attached to different CPUs. - -Linux abstracts this diversity using one of the two memory models: -FLATMEM and SPARSEMEM. Each architecture defines what -memory models it supports, what the default memory model is and -whether it is possible to manually override that default. - -All the memory models track the status of physical page frames using -struct page arranged in one or more arrays. - -Regardless of the selected memory model, there exists one-to-one -mapping between the physical page frame number (PFN) and the -corresponding `struct page`. - -Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn` -helpers that allow the conversion from PFN to `struct page` and vice -versa. - -FLATMEM -======= - -The simplest memory model is FLATMEM. This model is suitable for -non-NUMA systems with contiguous, or mostly contiguous, physical -memory. 
- -In the FLATMEM memory model, there is a global `mem_map` array that -maps the entire physical memory. For most architectures, the holes -have entries in the `mem_map` array. The `struct page` objects -corresponding to the holes are never fully initialized. - -To allocate the `mem_map` array, architecture specific setup code should -call :c:func:`free_area_init` function. Yet, the mappings array is not -usable until the call to :c:func:`memblock_free_all` that hands all the -memory to the page allocator. - -An architecture may free parts of the `mem_map` array that do not cover the -actual physical pages. In such case, the architecture specific -:c:func:`pfn_valid` implementation should take the holes in the -`mem_map` into account. - -With FLATMEM, the conversion between a PFN and the `struct page` is -straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the -`mem_map` array. - -The `ARCH_PFN_OFFSET` defines the first page frame number for -systems with physical memory starting at address different from 0. - -SPARSEMEM -========= - -SPARSEMEM is the most versatile memory model available in Linux and it -is the only memory model that supports several advanced features such -as hot-plug and hot-remove of the physical memory, alternative memory -maps for non-volatile memory devices and deferred initialization of -the memory map for larger systems. - -The SPARSEMEM model presents the physical memory as a collection of -sections. A section is represented with struct mem_section -that contains `section_mem_map` that is, logically, a pointer to an -array of struct pages. However, it is stored with some other magic -that aids the sections management. The section size and maximal number -of section is specified using `SECTION_SIZE_BITS` and -`MAX_PHYSMEM_BITS` constants defined by each architecture that -supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a -physical address that an architecture supports, the -`SECTION_SIZE_BITS` is an arbitrary value. 
- -The maximal number of sections is denoted `NR_MEM_SECTIONS` and -defined as - -.. math:: - - NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)} - -The `mem_section` objects are arranged in a two-dimensional array -called `mem_sections`. The size and placement of this array depend -on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of -sections: - -* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections` - array is static and has `NR_MEM_SECTIONS` rows. Each row holds a - single `mem_section` object. -* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections` - array is dynamically allocated. Each row contains PAGE_SIZE worth of - `mem_section` objects and the number of rows is calculated to fit - all the memory sections. - -The architecture setup code should call sparse_init() to -initialize the memory sections and the memory maps. - -With SPARSEMEM there are two possible ways to convert a PFN to the -corresponding `struct page` - a "classic sparse" and "sparse -vmemmap". The selection is made at build time and it is determined by -the value of `CONFIG_SPARSEMEM_VMEMMAP`. - -The classic sparse encodes the section number of a page in page->flags -and uses high bits of a PFN to access the section that maps that page -frame. Inside a section, the PFN is the index to the array of pages. - -The sparse vmemmap uses a virtually mapped memory map to optimize -pfn_to_page and page_to_pfn operations. There is a global `struct -page *vmemmap` pointer that points to a virtually contiguous array of -`struct page` objects. A PFN is an index to that array and the -offset of the `struct page` from `vmemmap` is the PFN of that -page. - -To use vmemmap, an architecture has to reserve a range of virtual -addresses that will map the physical pages containing the memory -map and make sure that `vmemmap` points to that range. 
In addition, -the architecture should implement :c:func:`vmemmap_populate` method -that will allocate the physical memory and create page tables for the -virtual memory map. If an architecture does not have any special -requirements for the vmemmap mappings, it can use default -:c:func:`vmemmap_populate_basepages` provided by the generic memory -management. - -The virtually mapped memory map allows storing `struct page` objects -for persistent memory devices in pre-allocated storage on those -devices. This storage is represented with struct vmem_altmap -that is eventually passed to vmemmap_populate() through a long chain -of function calls. The vmemmap_populate() implementation may use the -`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to -allocate memory map on the persistent memory device. - -ZONE_DEVICE -=========== -The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer -`struct page` `mem_map` services for device driver identified physical -address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact -that the page objects for these address ranges are never marked online, -and that a reference must be taken against the device, not just the page -to keep the memory pinned for active use. `ZONE_DEVICE`, via -:c:func:`devm_memremap_pages`, performs just enough memory hotplug to -turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and -:c:func:`get_user_pages` service for the given range of pfns. Since the -page reference count never drops below 1 the page is never tracked as -free memory and the page's `struct list_head lru` space is repurposed -for back referencing to the host device / driver that mapped the memory. - -While `SPARSEMEM` presents memory as a collection of sections, -optionally collected into memory blocks, `ZONE_DEVICE` users have a need -for smaller granularity of populating the `mem_map`. 
Given that -`ZONE_DEVICE` memory is never marked online it is subsequently never -subject to its memory ranges being exposed through the sysfs memory -hotplug api on memory block boundaries. The implementation relies on -this lack of user-api constraint to allow sub-section sized memory -ranges to be specified to :c:func:`arch_add_memory`, the top-half of -memory hotplug. Sub-section support allows for 2MB as the cross-arch -common alignment granularity for :c:func:`devm_memremap_pages`. - -The users of `ZONE_DEVICE` are: - -* pmem: Map platform persistent memory to be used as a direct-I/O target - via DAX mappings. - -* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` - event callbacks to allow a device-driver to coordinate memory management - events related to device-memory, typically GPU memory. See - Documentation/vm/hmm.rst. - -* p2pdma: Create `struct page` objects to allow peer devices in a - PCI/-E topology to coordinate direct-DMA operations between themselves, - i.e. bypass host memory. diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst deleted file mode 100644 index df5d7777fc6b..000000000000 --- a/Documentation/vm/mmu_notifier.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _mmu_notifier: - -When do you need to notify inside page table lock ? -=================================================== - -When clearing a pte/pmd we are given a choice to notify the event through -(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under -the page table lock. But that notification is not necessary in all cases. - -For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use -thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a -process virtual address space). 
There are only 2 cases when you need to notify -those secondary TLB while holding page table lock when clearing a pte/pmd: - - A) page backing address is freed before mmu_notifier_invalidate_range_end() - B) a page table entry is updated to point to a new page (COW, write fault - on zero page, __replace_page(), ...) - -Case A is obvious: you do not want to take the risk for the device to write to -a page that might now be used by some completely different task. - -Case B is more subtle. For correctness it requires the following sequence to -happen: - - - take page table lock - - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) - - set page table entry to point to new page - -If clearing the page table entry is not followed by a notify before setting -the new pte/pmd value then you can break memory model like C11 or C++11 for -the device. - -Consider the following scenario (the device uses a feature similar to ATS/PASID): - -Two addresses addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume -they are write protected for COW (other cases of B apply too).
- -:: - - [Time N] -------------------------------------------------------------------- - CPU-thread-0 {try to write to addrA} - CPU-thread-1 {try to write to addrB} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA and populate device TLB} - DEV-thread-2 {read addrB and populate device TLB} - [Time N+1] ------------------------------------------------------------------ - CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} - CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+2] ------------------------------------------------------------------ - CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} - CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {write to addrA which is a write to new page} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+3] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {preempted} - CPU-thread-2 {} - CPU-thread-3 {write to addrB which is a write to new page} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+4] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {} - DEV-thread-2 {} - [Time N+5] ------------------------------------------------------------------ - CPU-thread-0 {preempted} - CPU-thread-1 {} - CPU-thread-2 {} - CPU-thread-3 {} - DEV-thread-0 {read addrA from old page} - DEV-thread-2 {read addrB from new page} - -So here because at time N+2 the clear page table entry was not pair with a -notification 
to invalidate the secondary TLB, the device sees the new value for -addrB before seeing the new value for addrA. This breaks total memory ordering -for the device. - -When changing a pte to write protect or to point to a new write protected page -with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range -call to mmu_notifier_invalidate_range_end() outside the page table lock. This -is true even if the thread doing the page table update is preempted right after -releasing page table lock but before calling mmu_notifier_invalidate_range_end(). diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst deleted file mode 100644 index 99fdeca917ca..000000000000 --- a/Documentation/vm/numa.rst +++ /dev/null @@ -1,150 +0,0 @@ -.. _numa: - -Started Nov 1999 by Kanoj Sarcar - -============= -What is NUMA? -============= - -This question can be answered from a couple of perspectives: the -hardware view and the Linux software view. - -From the hardware perspective, a NUMA system is a computer platform that -comprises multiple components or assemblies each of which may contain 0 -or more CPUs, local memory, and/or IO buses. For brevity and to -disambiguate the hardware view of these physical components/assemblies -from the software abstraction thereof, we'll call the components/assemblies -'cells' in this document. - -Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset -of the system--although some components necessary for a stand-alone SMP system -may not be populated on any given cell. The cells of the NUMA system are -connected together with some sort of system interconnect--e.g., a crossbar or -point-to-point link are common types of NUMA system interconnects. Both of -these types of interconnects can be aggregated to create NUMA platforms with -cells at multiple distances from other cells. - -For Linux, the NUMA platforms of interest are primarily what is known as Cache -Coherent NUMA or ccNUMA systems.
With ccNUMA systems, all memory is visible -to and accessible from any CPU attached to any cell and cache coherency -is handled in hardware by the processor caches and/or the system interconnect. - -Memory access time and effective memory bandwidth varies depending on how far -away the cell containing the CPU or IO bus making the memory access is from the -cell containing the target memory. For example, access to memory by CPUs -attached to the same cell will experience faster access times and higher -bandwidths than accesses to memory on other, remote cells. NUMA platforms -can have cells at multiple remote distances from any given cell. - -Platform vendors don't build NUMA systems just to make software developers' -lives interesting. Rather, this architecture is a means to provide scalable -memory bandwidth. However, to achieve scalable memory bandwidth, system and -application software must arrange for a large majority of the memory references -[cache misses] to be to "local" memory--memory on the same cell, if any--or -to the closest cell with memory. - -This leads to the Linux software view of a NUMA system: - -Linux divides the system's hardware resources into multiple software -abstractions called "nodes". Linux maps the nodes onto the physical cells -of the hardware platform, abstracting away some of the details for some -architectures. As with physical cells, software nodes may contain 0 or more -CPUs, memory and/or IO buses. And, again, memory accesses to memory on -"closer" nodes--nodes that map to closer cells--will generally experience -faster access times and higher effective bandwidth than accesses to more -remote cells. - -For some architectures, such as x86, Linux will "hide" any node representing a -physical cell that has no memory attached, and reassign any CPUs attached to -that cell to a node representing a cell that does have memory. 
Thus, on -these architectures, one cannot assume that all CPUs that Linux associates with -a given node will see the same local memory access times and bandwidth. - -In addition, for some architectures, again x86 is an example, Linux supports -the emulation of additional nodes. For NUMA emulation, Linux will carve up -the existing nodes--or the system memory for non-NUMA platforms--into multiple -nodes. Each emulated node will manage a fraction of the underlying cells' -physical memory. NUMA emulation is useful for testing NUMA kernel and -application features on non-NUMA platforms, and as a sort of memory resource -management mechanism when used together with cpusets. -[see Documentation/admin-guide/cgroup-v1/cpusets.rst] - -For each node with memory, Linux constructs an independent memory management -subsystem, complete with its own free page lists, in-use page lists, usage -statistics and locks to mediate access. In addition, Linux constructs for -each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE], -an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a -selected zone/node cannot satisfy the allocation request. This situation, -when a zone has no available memory to satisfy a request, is called -"overflow" or "fallback". - -Because some nodes contain multiple zones containing different types of -memory, Linux must decide whether to order the zonelists such that allocations -fall back to the same zone type on a different node, or to a different zone -type on the same node. This is an important consideration because some zones, -such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default Node ordered zonelist. This means it tries to fall back to other zones -from the same node before using remote nodes which are ordered by NUMA distance. - -By default, Linux will attempt to satisfy memory allocation requests from the -node to which the CPU that executes the request is assigned.
Specifically, -Linux will attempt to allocate from the first node in the appropriate zonelist -for the node where the request originates. This is called "local allocation." -If the "local" node cannot satisfy the request, the kernel will examine other -nodes' zones in the selected zonelist looking for the first zone in the list -that can satisfy the request. - -Local allocation will tend to keep subsequent access to the allocated memory -"local" to the underlying physical resources and off the system interconnect-- -as long as the task on whose behalf the kernel allocated some memory does not -later migrate away from that memory. The Linux scheduler is aware of the -NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler -attempts to minimize task migration to distant scheduling domains. However, -the scheduler does not take a task's NUMA footprint into account directly. -Thus, under sufficient imbalance, tasks can migrate between nodes, remote -from their initial node and kernel data structures. - -System administrators and application designers can restrict a task's migration -to improve NUMA locality using various CPU affinity command line interfaces, -such as taskset(1) and numactl(1), and program interfaces such as -sched_setaffinity(2). Further, one can modify the kernel's default local -allocation behavior using Linux NUMA memory policy. [see -:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `]. - -System administrators can restrict the CPUs and nodes' memories that a non- -privileged user can specify in the scheduling or NUMA commands and functions -using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst] - -On architectures that do not hide memoryless nodes, Linux will include only -zones [nodes] with memory in the zonelists. 
This means that for a memoryless -node the "local memory node"--the node of the first zone in CPU's node's -zonelist--will not be the node itself. Rather, it will be the node that the -kernel selected as the nearest node with memory when it built the zonelists. -So, default, local allocations will succeed with the kernel supplying the -closest available memory. This is a consequence of the same mechanism that -allows such allocations to fall back to other nearby nodes when a node that -does contain memory overflows. - -Some kernel allocations do not want or cannot tolerate this allocation fallback -behavior. Rather they want to be sure they get memory from the specified node -or get notified that the node has no free memory. This is usually the case when -a subsystem allocates per CPU memory resources, for example. - -A typical model for making such an allocation is to obtain the node id of the -node to which the "current CPU" is attached using one of the kernel's -numa_node_id() or cpu_to_node() functions and then request memory from only -the node id returned. When such an allocation fails, the requesting subsystem -may revert to its own fallback path. The slab kernel memory allocator is an -example of this. Or, the subsystem may choose to disable or not to enable -itself on allocation failure. The kernel profiling subsystem is an example of -this. - -If the architecture supports--does not hide--memoryless nodes, then CPUs -attached to memoryless nodes would always incur the fallback path overhead -or some subsystems would fail to initialize if they attempted to allocate -memory exclusively from a node without memory. To support such -architectures transparently, kernel subsystems can use the numa_mem_id() -or cpu_to_mem() function to locate the "local memory node" for the calling or -specified CPU. Again, this is the same node from which default, local page -allocations will be attempted.
diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst deleted file mode 100644 index 18e9e40c1ec1..000000000000 --- a/Documentation/vm/oom.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -Out Of Memory Handling -====================== diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst deleted file mode 100644 index 1addb0c374a4..000000000000 --- a/Documentation/vm/overcommit-accounting.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _overcommit_accounting: - -===================== -Overcommit Accounting -===================== - -The Linux kernel supports the following overcommit handling modes - -0 - Heuristic overcommit handling. Obvious overcommits of address - space are refused. Used for a typical system. It ensures a - seriously wild allocation fails while allowing overcommit to - reduce swap usage. root is allowed to allocate slightly more - memory in this mode. This is the default. - -1 - Always overcommit. Appropriate for some scientific - applications. Classic example is code using sparse arrays and - just relying on the virtual memory consisting almost entirely - of zero pages. - -2 - Don't overcommit. The total address space commit for the - system is not permitted to exceed swap + a configurable amount - (default is 50%) of physical RAM. Depending on the amount you - use, in most situations this means a process will not be - killed while accessing pages but will receive errors on memory - allocation as appropriate. - - Useful for applications that want to guarantee their memory - allocations will be available in the future without having to - initialize every page. - -The overcommit policy is set via the sysctl ``vm.overcommit_memory``. - -The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage) -or ``vm.overcommit_kbytes`` (absolute value). These only have an effect -when ``vm.overcommit_memory`` is set to 2. 
- -The current overcommit limit and amount committed are viewable in -``/proc/meminfo`` as CommitLimit and Committed_AS respectively. - -Gotchas -======= - -The C language stack growth does an implicit mremap. If you want absolute -guarantees and run close to the edge you MUST mmap your stack for the -largest size you think you will need. For typical stack usage this does -not matter much but it's a corner case if you really really care - -In mode 2 the MAP_NORESERVE flag is ignored. - - -How It Works -============ - -The overcommit is based on the following rules - -For a file backed map - | SHARED or READ-only - 0 cost (the file is the map not swap) - | PRIVATE WRITABLE - size of mapping per instance - -For an anonymous or ``/dev/zero`` map - | SHARED - size of mapping - | PRIVATE READ-only - 0 cost (but of little use) - | PRIVATE WRITABLE - size of mapping per instance - -Additional accounting - | Pages made writable copies by mmap - | shmfs memory drawn from the same pool - -Status -====== - -* We account mmap memory mappings -* We account mprotect changes in commit -* We account mremap changes in size -* We account brk -* We account munmap -* We report the commit status in /proc -* Account and check on fork -* Review stack handling/building on exec -* SHMfs accounting -* Implement actual limit enforcement - -To Do -===== -* Account ptrace pages (this is hard) diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst deleted file mode 100644 index d9b4495561f1..000000000000 --- a/Documentation/vm/page_allocation.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Page Allocation -=============== diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst deleted file mode 100644 index 75eba7c431b2..000000000000 --- a/Documentation/vm/page_cache.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -========== -Page Cache -========== diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst deleted file mode 100644 index 7d6f9385d129..000000000000 --- a/Documentation/vm/page_frags.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. _page_frags: - -============== -Page fragments -============== - -A page fragment is an arbitrary-length arbitrary-offset area of memory -which resides within a 0 or higher order compound page. Multiple -fragments within that page are individually refcounted, in the page's -reference counter. - -The page_frag functions, page_frag_alloc and page_frag_free, provide a -simple allocation framework for page fragments. This is used by the -network stack and network device drivers to provide a backing region of -memory for use as either an sk_buff->head, or to be used in the "frags" -portion of skb_shared_info. - -In order to make use of the page fragment APIs a backing page fragment -cache is needed. This provides a central point for the fragment allocation -and allows multiple calls to make use of a cached page. The -advantage to doing this is that multiple calls to get_page can be avoided -which can be expensive at allocation time. However due to the nature of -this caching it is required that any calls to the cache be protected by -either a per-cpu limitation, or a per-cpu limitation and forcing interrupts -to be disabled when executing the fragment allocation. - -The network stack uses two separate caches per CPU to handle fragment -allocation. The netdev_alloc_cache is used by callers making use of the -netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The -main difference between these two calls is the context in which they may be -called.
The "netdev" prefixed functions are usable in any context as these -functions will disable interrupts, while the "napi" prefixed functions are -only usable within the softirq context. - -Many network device drivers use a similar methodology for allocating page -fragments, but the page fragments are cached at the ring or descriptor -level. In order to enable these cases it is necessary to provide a generic -way of tearing down a page cache. For this reason __page_frag_cache_drain -was implemented. It allows for freeing multiple references from a single -page via a single call. The advantage to doing this is that it allows for -cleaning up the multiple references that were added to a page in order to -avoid calling get_page per allocation. - -Alexander Duyck, Nov 29, 2016. diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst deleted file mode 100644 index 8c5cb8147e55..000000000000 --- a/Documentation/vm/page_migration.rst +++ /dev/null @@ -1,288 +0,0 @@ -.. _page_migration: - -============== -Page migration -============== - -Page migration allows moving the physical location of pages between -nodes in a NUMA system while the process is running. This means that the -virtual addresses that the process sees do not change. However, the -system rearranges the physical location of those pages. - -Also see :ref:`Heterogeneous Memory Management (HMM) ` -for migrating pages to or from device private memory. - -The main intent of page migration is to reduce the latency of memory accesses -by moving pages near to the processor where the process accessing that memory -is running. - -Page migration allows a process to manually relocate the node on which its -pages are located through the MF_MOVE and MF_MOVE_ALL options while setting -a new memory policy via mbind(). The pages of a process can also be relocated -from another process using the sys_migrate_pages() function call. 
The -migrate_pages() function call takes two sets of nodes and moves pages of a -process that are located on the from nodes to the destination nodes. -Page migration functions are provided by the numactl package by Andi Kleen -(a version later than 0.9.3 is required. Get it from -https://github.com/numactl/numactl.git). numactl provides libnuma -which provides an interface similar to other NUMA functionality for page -migration. cat ``/proc/<pid>/numa_maps`` allows an easy review of where the -pages of a process are located. See also the numa_maps documentation in the -proc(5) man page. - -Manual migration is useful if for example the scheduler has relocated -a process to a processor on a distant node. A batch scheduler or an -administrator may detect the situation and move the pages of the process -nearer to the new processor. The kernel itself only provides -manual page migration support. Automatic page migration may be implemented -through user space processes that move pages. A special function call -"move_pages" allows the moving of individual pages within a process. -For example, a NUMA profiler may obtain a log showing frequent off-node -accesses and may use the result to move pages to more advantageous -locations. - -Larger installations usually partition the system using cpusets into -sections of nodes. Paul Jackson has equipped cpusets with the ability to -move pages when a task is moved to another cpuset (See -:ref:`CPUSETS <cpusets>`). -Cpusets allow the automation of process locality. If a task is moved to -a new cpuset then also all its pages are moved with it so that the -performance of the process does not sink dramatically. Also the pages -of processes in a cpuset are moved if the allowed memory nodes of a -cpuset are changed. - -Page migration allows the preservation of the relative location of pages -within a group of nodes for all migration techniques which will preserve a -particular memory allocation pattern generated even after migrating a -process.
This is necessary in order to preserve the memory latencies. -Processes will run with similar performance after migration. - -Page migration occurs in several steps. First a high level -description for those trying to use migrate_pages() from the kernel -(for userspace usage see the Andi Kleen's numactl package mentioned above) -and then a low level description of how the low level details work. - -In kernel use of migrate_pages() -================================ - -1. Remove pages from the LRU. - - Lists of pages to be migrated are generated by scanning over - pages and moving them into lists. This is done by - calling isolate_lru_page(). - Calling isolate_lru_page() increases the references to the page - so that it cannot vanish while the page migration occurs. - It also prevents the swapper or other scans from encountering - the page. - -2. We need to have a function of type new_page_t that can be - passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. - -3. The migrate_pages() function is called which attempts - to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. - -How migrate_pages() works -========================= - -migrate_pages() does several passes over its list of pages. A page is moved -if all references to a page are removable at the time. The page has -already been removed from the LRU via isolate_lru_page() and the refcount -is increased so that the page cannot be freed while page migration occurs. - -Steps: - -1. Lock the page to be migrated. - -2. Ensure that writeback is complete. - -3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet up-to-date) page immediately block while the move is in progress. - -4. All the page table references to the page are converted to migration - entries. This decreases the mapcount of a page. 
If the resulting - mapcount is not zero then we do not migrate the page. All user space - processes that attempt to access the page will now wait on the page lock - or wait for the migration page table entry to be removed. - -5. The i_pages lock is taken. This will cause all processes trying - to access the page via the mapping to block on the spinlock. - -6. The refcount of the page is examined and we back out if references remain. - Otherwise, we know that we are the only one referencing this page. - -7. The radix tree is checked and if it does not contain the pointer to this - page then we back out because someone else modified the radix tree. - -8. The new page is prepped with some settings from the old page so that - accesses to the new page will discover a page with the correct settings. - -9. The radix tree is changed to point to the new page. - -10. The reference count of the old page is dropped because the address space - reference is gone. A reference to the new page is established because - the new page is referenced by the address space. - -11. The i_pages lock is dropped. With that lookups in the mapping - become possible again. Processes will move from spinning on the lock - to sleeping on the locked new page. - -12. The page contents are copied to the new page. - -13. The remaining page flags are copied to the new page. - -14. The old page flags are cleared to indicate that the page does - not provide any information anymore. - -15. Queued up writeback on the new page is triggered. - -16. If migration entries were inserted into the page table, then replace them - with real ptes. Doing so will enable access for user space processes not - already waiting for the page lock. - -17. The page locks are dropped from the old and new page. - Processes waiting on the page lock will redo their page faults - and will reach the new page. - -18. The new page is moved to the LRU and can be scanned by the swapper, - etc. again. 
- -Non-LRU page migration -====================== - -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. - -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. 
Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on the isolated page, VM should return the isolated page - to the driver so VM calls the driver's putback_page() with the isolated page. - In this function, the driver should put the isolated page back into its own data - structure. - -Non-LRU movable page flags - - There are two page flags for supporting non-LRU movable page. - - * PG_movable - - Driver should use the function below to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather, VM - reuses the page->mapping's lower bits to represent it:: - - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping() which masks off the low two bits of page->mapping under - page lock so it can get the right struct address_space. - - For testing of non-LRU movable pages, VM supports __PageMovable() function. - However, it doesn't guarantee to identify non-LRU movable pages because - the page->mapping field is unified with other variables in struct page. 
- If the driver releases the page after isolation by VM, page->mapping - doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set - (look at __ClearPageMovable). But __PageMovable() is cheap to call whether - page is LRU or non-LRU movable once the page has been isolated because LRU - pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also - good for just peeking to test non-LRU movable pages before more expensive - checking with lock_page() in pfn scanning to select a victim. - - For guaranteeing non-LRU movable page, VM provides PageMovable() function. - Unlike __PageMovable(), PageMovable() validates page->mapping and - mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents - sudden destroying of page->mapping. - - Drivers using __SetPageMovable() should clear the flag via - __ClearPageMovable() under page_lock() before releasing the page. - - * PG_isolated - - To prevent concurrent isolation among several CPUs, VM marks isolated page - as PG_isolated under lock_page(). So if a CPU encounters PG_isolated - non-LRU movable page, it can skip it. Driver doesn't need to manipulate the - flag because VM will set/clear it automatically. Keep in mind that if the - driver sees a PG_isolated page, it means the page has been isolated by the - VM so it shouldn't touch the page.lru field. - The PG_isolated flag is aliased with the PG_reclaim flag so drivers - shouldn't use PG_isolated for their own purposes. - -Monitoring Migration -===================== - -The following events (counters) can be used to monitor page migration. - -1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a - page was migrated. If the page was a non-THP and non-hugetlb page, then - this counter is increased by one. If the page was a THP or hugetlb, then - this counter is increased by the number of THP or hugetlb subpages.
- For example, migration of a single 2MB THP that has 4KB-size base pages - (subpages) will cause this counter to increase by 512. - -2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for - PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, - if it was a THP or hugetlb. - -3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. - -4. THP_MIGRATION_FAIL: A THP could not be migrated nor could it be split. - -5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had - to be split. After splitting, a migration retry was used for its sub-pages. - -THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or -PGMIGRATE_FAIL events. For example, a THP migration failure will cause both -THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. - -Christoph Lameter, May 8, 2006. -Minchan Kim, Mar 28, 2016. diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst deleted file mode 100644 index f5c954afe97c..000000000000 --- a/Documentation/vm/page_owner.rst +++ /dev/null @@ -1,196 +0,0 @@ -.. _page_owner: - -================================================== -page owner: Tracking about who allocated each page -================================================== - -Introduction -============ - -page owner is for the tracking about who allocated each page. -It can be used to debug memory leak or to find a memory hogger. -When allocation happens, information about allocation such as call stack -and order of pages is stored into certain storage for each page. -When we need to know about status of all pages, we can get and analyze -this information. - -Although we already have tracepoint for tracing page allocation/free, -using it for analyzing who allocated each page is rather complex. We need -to enlarge the trace buffer for preventing overlapping until userspace -program launched.
And, launched program continually dump out the trace -buffer for later analysis and it would change system behaviour with more -possibility rather than just keeping it in memory, so bad for debugging. - -page owner can also be used for various purposes. For example, accurate -fragmentation statistics can be obtained through gfp flag information of -each page. It is already implemented and activated if page owner is -enabled. Other usages are more than welcome. - -page owner is disabled by default. So, if you'd like to use it, you need -to add "page_owner=on" to your boot cmdline. If the kernel is built -with page owner and page owner is disabled in runtime due to not enabling -boot option, runtime overhead is marginal. If disabled in runtime, it -doesn't require memory to store owner information, so there is no runtime -memory overhead. And, page owner inserts just two unlikely branches into -the page allocator hotpath and if not enabled, then allocation is done -like as the kernel without page owner. These two unlikely branches should -not affect to allocation performance, especially if the static keys jump -label patching functionality is available. Following is the kernel's code -size change due to this facility. - -- Without page owner:: - - text data bss dec hex filename - 48392 2333 644 51369 c8a9 mm/page_alloc.o - -- With page owner:: - - text data bss dec hex filename - 48800 2445 644 51889 cab1 mm/page_alloc.o - 6662 108 29 6799 1a8f mm/page_owner.o - 1025 8 8 1041 411 mm/page_ext.o - -Although, roughly, 8 KB code is added in total, page_alloc.o increase by -520 bytes and less than half of it is in hotpath. Building the kernel with -page owner and turning it on if needed would be great option to debug -kernel memory problem. - -There is one notice that is caused by implementation detail. page owner -stores information into the memory from struct page extension. 
This memory -is initialized some time later than the page allocator starts in sparse -memory system, so, until initialization, many pages can be allocated and -they would have no owner information. To fix it up, these early allocated -pages are investigated and marked as allocated in initialization phase. -Although it doesn't mean that they have the right owner information, -at least, we can tell whether the page is allocated or not, -more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages -are caught and marked, although they are mostly allocated from struct -page extension feature. Anyway, after that, no page is left in -un-tracking state. - -Usage -===== - -1) Build user-space helper:: - - cd tools/vm - make page_owner_sort - -2) Enable page owner: add "page_owner=on" to boot cmdline. - -3) Do the job that you want to debug. - -4) Analyze information from page owner:: - - cat /sys/kernel/debug/page_owner > page_owner_full.txt - ./page_owner_sort page_owner_full.txt sorted_page_owner.txt - - The general output of ``page_owner_full.txt`` is as follows:: - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - Page allocated via order XXX, ... - PFN XXX ... - // Detailed stack - - The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows - in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the parameter(s). - - See the result about who allocated each page - in the ``sorted_page_owner.txt``. General output:: - - XXX times, XXX pages: - Page allocated via order XXX, ... - // Detailed stack - - By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the page nums of buf, use the ``-m`` parameter. - The detailed parameters are: - - fundamental function:: - - Sort: - -a Sort by memory allocation time. - -m Sort by total memory. - -p Sort by pid. - -P Sort by tgid. - -n Sort by task command name.
- -r Sort by memory release time. - -s Sort by stack trace. - -t Sort by times (default). - --sort Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]]. - Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is - optional since default direction is increasing numerical or lexicographic - order. Mixed use of abbreviated and complete-form of keys is allowed. - - Examples: - ./page_owner_sort --sort=n,+pid,-tgid - ./page_owner_sort --sort=at - - additional function:: - - Cull: - --cull - Specify culling rules.Culling syntax is key[,key[,...]].Choose a - multi-letter key from the **STANDARD FORMAT SPECIFIERS** section. - - is a single argument in the form of a comma-separated list, - which offers a way to specify individual culling rules. The recognized - keywords are described in the **STANDARD FORMAT SPECIFIERS** section below. - can be specified by the sequence of keys k1,k2, ..., as described in - the STANDARD SORT KEYS section below. Mixed use of abbreviated and - complete-form of keys is allowed. - - Examples: - ./page_owner_sort --cull=stacktrace - ./page_owner_sort --cull=st,pid,name - ./page_owner_sort --cull=n,f - - Filter: - -f Filter out the information of blocks whose memory has been released. - - Select: - --pid Select by pid. This selects the blocks whose process ID - numbers appear in . - --tgid Select by tgid. This selects the blocks whose thread - group ID numbers appear in . - --name Select by task command name. This selects the blocks whose - task command name appear in . - - , , are single arguments in the form of a comma-separated list, - which offers a way to specify individual selecting rules. 
- - - Examples: - ./page_owner_sort --pid=1 - ./page_owner_sort --tgid=1,2,3 - ./page_owner_sort --name name1,name2 - -STANDARD FORMAT SPECIFIERS -========================== -:: - - For --sort option: - - KEY LONG DESCRIPTION - p pid process ID - tg tgid thread group ID - n name task command name - st stacktrace stack trace of the page allocation - T txt full text of block - ft free_ts timestamp of the page when it was released - at alloc_ts timestamp of the page when it was allocated - ator allocator memory allocator for pages - - For --cull option: - - KEY LONG DESCRIPTION - p pid process ID - tg tgid thread group ID - n name task command name - f free whether the page has been released or not - st stacktrace stack trace of the page allocation - ator allocator memory allocator for pages diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst deleted file mode 100644 index 50a30b7f8ac3..000000000000 --- a/Documentation/vm/page_reclaim.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============ -Page Reclaim -============ diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst deleted file mode 100644 index 1a09472f10a3..000000000000 --- a/Documentation/vm/page_table_check.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _page_table_check: - -================ -Page Table Check -================ - -Introduction -============ - -Page table check allows to harden the kernel by ensuring that some types of -the memory corruptions are prevented. - -Page table check performs extra verifications at the time when new pages become -accessible from the userspace by getting their page table entries (PTEs, PMDs, -etc.) added into the table. - -In case of detected corruption, the kernel is crashed. There is a small -performance and memory overhead associated with the page table check.
Therefore, -it is disabled by default, but can be optionally enabled on systems where the -extra hardening outweighs the performance costs. Also, because page table check -is synchronous, it can help with debugging double map memory corruption issues, -by crashing kernel at the time wrong mapping occurs instead of later which is -often the case with memory corruptions bugs. - -Double mapping detection logic -============================== - -+-------------------+-------------------+-------------------+------------------+ -| Current Mapping | New mapping | Permissions | Rule | -+===================+===================+===================+==================+ -| Anonymous | Anonymous | Read | Allow | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Anonymous | Read / Write | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Anonymous | Named | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Anonymous | Any | Prohibit | -+-------------------+-------------------+-------------------+------------------+ -| Named | Named | Any | Allow | -+-------------------+-------------------+-------------------+------------------+ - -Enabling Page Table Check -========================= - -Build kernel with: - -- PAGE_TABLE_CHECK=y - Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK - is available. - -- Boot with 'page_table_check=on' kernel parameter. - -Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page -table support without extra kernel parameter. diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst deleted file mode 100644 index 96939571d7bc..000000000000 --- a/Documentation/vm/page_tables.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -=========== -Page Tables -=========== diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst deleted file mode 100644 index 2ab7b8c1c863..000000000000 --- a/Documentation/vm/physical_memory.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Physical Memory -=============== diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst deleted file mode 100644 index e8618fbc62c9..000000000000 --- a/Documentation/vm/process_addrs.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================= -Process Addresses -================= diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst deleted file mode 100644 index 7bef6718e3a9..000000000000 --- a/Documentation/vm/remap_file_pages.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _remap_file_pages: - -============================== -remap_file_pages() system call -============================== - -The remap_file_pages() system call is used to create a nonlinear mapping, -that is, a mapping in which the pages of the file are mapped into a -nonsequential order in memory. The advantage of using remap_file_pages() -over using repeated calls to mmap(2) is that the former approach does not -require the kernel to create additional VMA (Virtual Memory Area) data -structures. - -Supporting of nonlinear mapping requires significant amount of non-trivial -code in kernel virtual memory subsystem including hot paths. Also to get -nonlinear mapping work kernel need a way to distinguish normal page table -entries from entries with file offset (pte_file). Kernel reserves flag in -PTE for this purpose. PTE flags are scarce resource especially on some CPU -architectures. It would be nice to free up the flag for other usage. - -Fortunately, there are not many users of remap_file_pages() in the wild. 
-It's only known that one enterprise RDBMS implementation uses the syscall -on 32-bit systems to map files bigger than can linearly fit into 32-bit -virtual address space. This use-case is not critical anymore since 64-bit -systems are widely available. - -The syscall is deprecated and has now been replaced with an emulation. The -emulation creates new VMAs instead of nonlinear mappings. It's going to -work slower for rare users of remap_file_pages() but ABI is preserved. - -One side effect of emulation (apart from performance) is that user can hit -vm.max_map_count limit more easily due to additional VMAs. See comment for -DEFAULT_MAX_MAP_COUNT for more details on the limit. diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst deleted file mode 100644 index 8b01ebb4c30e..000000000000 --- a/Documentation/vm/shmfs.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -======================== -Shared Memory Filesystem -======================== diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst deleted file mode 100644 index 87d5a5bb172f..000000000000 --- a/Documentation/vm/slab.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -=============== -Slab Allocation -=============== diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst deleted file mode 100644 index 43063ade737a..000000000000 --- a/Documentation/vm/slub.rst +++ /dev/null @@ -1,452 +0,0 @@ -.. _slub: - -========================== -Short users guide for SLUB -========================== - -The basic philosophy of SLUB is very different from SLAB. SLAB -requires rebuilding the kernel to activate debug options for all -slab caches. SLUB always includes full debugging but it is off by default. -SLUB can enable debugging only for selected slabs in order to avoid -an impact on overall system performance which may make a bug more -difficult to find.
- -In order to switch debugging on one can add an option ``slub_debug`` -to the kernel command line. That will enable full debugging for -all slabs. - -Typically one would then use the ``slabinfo`` command to get statistical -data and perform operation on the slabs. By default ``slabinfo`` only lists -slabs that have data in them. See "slabinfo -h" for more options when -running the command. ``slabinfo`` can be compiled with -:: - - gcc -o slabinfo tools/vm/slabinfo.c - -Some of the modes of operation of ``slabinfo`` require that slub debugging -be enabled on the command line. F.e. no tracking information will be -available without debugging on and validation can only partially -be performed if debugging was not switched on. - -Some more sophisticated uses of slub_debug: -------------------------------------------- - -Parameters may be given to ``slub_debug``. If none is specified then full -debugging is enabled. Format: - -slub_debug= - Enable options for all slabs - -slub_debug=,,,... - Enable options only for select slabs (no spaces - after a comma) - -Multiple blocks of options for all slabs or selected slabs can be given, with -blocks of options delimited by ';'. The last of "all slabs" blocks is applied -to all slabs except those that match one of the "select slabs" block. Options -of the first "select slabs" blocks that matches the slab's name are applied. - -Possible debug options are:: - - F Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS - Sorry SLAB legacy issues) - Z Red zoning - P Poisoning (object and padding) - U User tracking (free and alloc) - T Trace (please only use on single slabs) - A Enable failslab filter mark for the cache - O Switch debugging off for caches that would have - caused higher minimum slab orders - - Switch all debugging off (useful if the kernel is - configured with CONFIG_SLUB_DEBUG_ON) - -F.e. 
in order to boot just with sanity checks and red zoning one would specify:: - - slub_debug=FZ - -Trying to find an issue in the dentry cache? Try:: - - slub_debug=,dentry - -to only enable debugging on the dentry cache. You may use an asterisk at the -end of the slab name, in order to cover all slabs with the same prefix. For -example, here's how you can poison the dentry cache as well as all kmalloc -slabs:: - - slub_debug=P,kmalloc-*,dentry - -Red zoning and tracking may realign the slab. We can just apply sanity checks -to the dentry cache with:: - - slub_debug=F,dentry - -Debugging options may require the minimum possible slab order to increase as -a result of storing the metadata (for example, caches with PAGE_SIZE object -sizes). This has a higher likelihood of resulting in slab allocation errors -in low memory situations or if there's high fragmentation of memory. To -switch off debugging for such caches by default, use:: - - slub_debug=O - -You can apply different options to different lists of slab names, using blocks -of options. This will enable red zoning for dentry and user tracking for -kmalloc. All other slabs will not get any debugging enabled:: - - slub_debug=Z,dentry;U,kmalloc-* - -You can also enable options (e.g. sanity checks and poisoning) for all caches -except some that are deemed too performance critical and don't need to be -debugged by specifying global debug options followed by a list of slab names -with "-" as options:: - - slub_debug=FZ;-,zs_handle,zspage - -The state of each debug option for a slab can be found in the respective files -under:: - - /sys/kernel/slab/<slab name>/ - -If the file contains 1, the option is enabled, 0 means disabled. The debug -options from the ``slub_debug`` parameter translate to the following files:: - - F sanity_checks - Z red_zone - P poison - U store_user - T trace - A failslab - -Careful with tracing: It may spew out lots of information and never stop if -used on the wrong slab.
- -Slab merging -============ - -If no debug options are specified then SLUB may merge similar slabs together -in order to reduce overhead and increase cache hotness of objects. -``slabinfo -a`` displays which slabs were merged together. - -Slab validation -=============== - -SLUB can validate all object if the kernel was booted with slub_debug. In -order to do so you must have the ``slabinfo`` tool. Then you can do -:: - - slabinfo -v - -which will test all objects. Output will be generated to the syslog. - -This also works in a more limited way if boot was without slab debug. -In that case ``slabinfo -v`` simply tests all reachable objects. Usually -these are in the cpu slabs and the partial slabs. Full slabs are not -tracked by SLUB in a non debug situation. - -Getting more performance -======================== - -To some degree SLUB's performance is limited by the need to take the -list_lock once in a while to deal with partial slabs. That overhead is -governed by the order of the allocation for each slab. The allocations -can be influenced by kernel parameters: - -.. slub_min_objects=x (default 4) -.. slub_min_order=x (default 0) -.. slub_max_order=x (default 3 (PAGE_ALLOC_COSTLY_ORDER)) - -``slub_min_objects`` - allows to specify how many objects must at least fit into one - slab in order for the allocation order to be acceptable. In - general slub will be able to perform this number of - allocations on a slab without consulting centralized resources - (list_lock) where contention may occur. - -``slub_min_order`` - specifies a minimum order of slabs. A similar effect like - ``slub_min_objects``. - -``slub_max_order`` - specified the order at which ``slub_min_objects`` should no - longer be checked. This is useful to avoid SLUB trying to - generate super large order pages to fit ``slub_min_objects`` - of a slab cache with large object sizes into one high order - page. 
Setting command line parameter - ``debug_guardpage_minorder=N`` (N > 0), forces setting - ``slub_max_order`` to 0, what cause minimum possible order of - slabs allocation. - -SLUB Debug output -================= - -Here is a sample of slub debug output:: - - ==================================================================== - BUG kmalloc-8: Right Redzone overwritten - -------------------------------------------------------------------- - - INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc - INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58 - INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58 - INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554 - - Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ - Object (0xc90f6d20): 31 30 31 39 2e 30 30 35 1019.005 - Redzone (0xc90f6d28): 00 cc cc cc . - Padding (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ - - [] dump_trace+0x63/0x1eb - [] show_trace_log_lvl+0x1a/0x2f - [] show_trace+0x12/0x14 - [] dump_stack+0x16/0x18 - [] object_err+0x143/0x14b - [] check_object+0x66/0x234 - [] __slab_free+0x239/0x384 - [] kfree+0xa6/0xc6 - [] get_modalias+0xb9/0xf5 - [] dmi_dev_uevent+0x27/0x3c - [] dev_uevent+0x1ad/0x1da - [] kobject_uevent_env+0x20a/0x45b - [] kobject_uevent+0xa/0xf - [] store_uevent+0x4f/0x58 - [] dev_attr_store+0x29/0x2f - [] sysfs_write_file+0x16e/0x19c - [] vfs_write+0xd1/0x15a - [] sys_write+0x3d/0x72 - [] sysenter_past_esp+0x5f/0x99 - [] 0xb7f7b410 - ======================= - - FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc - -If SLUB encounters a corrupted object (full detection requires the kernel -to be booted with slub_debug) then the following output will be dumped -into the syslog: - -1. Description of the problem encountered - - This will be a message in the system log starting with:: - - =============================================== - BUG : - ----------------------------------------------- - - INFO: - - INFO: Slab
- INFO: Object <address> <object information>
- INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by cpu> pid=<pid of the process> - INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu> - pid=<pid of the process> - - (Object allocation / free information is only available if SLAB_STORE_USER is - set for the slab. slub_debug sets that option) - -2. The object contents if an object was involved. - - Various types of lines can follow the BUG SLUB line: - - Bytes b4 <address>
: <bytes> - Shows a few bytes before the object where the problem was detected. - Can be useful if the corruption does not stop with the start of the - object. - - Object <address>
: <bytes> - The bytes of the object. If the object is inactive then the bytes - typically contain poison values. Any non-poison value shows a - corruption by a write after free. - - Redzone <address>
: <bytes> - The Redzone following the object. The Redzone is used to detect - writes after the object. All bytes should always have the same - value. If there is any deviation then it is due to a write after - the object boundary. - - (Redzone information is only available if SLAB_RED_ZONE is set. - slub_debug sets that option) - - Padding <address>
: <bytes> - Unused data to fill up the space in order to get the next object - properly aligned. In the debug case we make sure that there are - at least 4 bytes of padding. This allows the detection of writes - before the object. - -3. A stackdump - - The stackdump describes the location where the error was detected. The cause - of the corruption may more likely be found by looking at the function that - allocated or freed the object. - -4. Report on how the problem was dealt with in order to ensure the continued - operation of the system. - - These are messages in the system log beginning with:: - - FIX <slab cache affected>: <corrective action taken> - - In the above sample SLUB found that the Redzone of an active object has - been overwritten. Here a string of 8 characters was written into a slab that - has the length of 8 characters. However, an 8-character string needs a - terminating 0. That zero has overwritten the first byte of the Redzone field. - After reporting the details of the issue encountered the FIX SLUB message - tells us that SLUB has restored the Redzone to its proper value and then - system operations continue. - -Emergency operations -==================== - -Minimal debugging (sanity checks alone) can be enabled by booting with:: - - slub_debug=F - -This will generally be enough to enable the resiliency features of slub -which will keep the system running even if a bad kernel component will -keep corrupting objects. This may be important for production systems. -Performance will be impacted by the sanity checks and there will be a -continual stream of error messages to the syslog but no additional memory -will be used (unlike full debugging). - -No guarantees. The kernel component still needs to be fixed.
Performance -may be optimized further by locating the slab that experiences corruption -and enabling debugging only for that cache - -I.e.:: - - slub_debug=F,dentry - -If the corruption occurs by writing after the end of the object then it -may be advisable to enable a Redzone to avoid corrupting the beginning -of other objects:: - - slub_debug=FZ,dentry - -Extended slabinfo mode and plotting -=================================== - -The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes: - - Slabcache Totals - - Slabs sorted by size (up to -N slabs, default 1) - - Slabs sorted by loss (up to -N slabs, default 1) - -Additionally, in this mode ``slabinfo`` does not dynamically scale -sizes (G/M/K) and reports everything in bytes (this functionality is -also available to other slabinfo modes via '-B' option) which makes -reporting more precise and accurate. Moreover, in some sense the `-X' -mode also simplifies the analysis of slabs' behaviour, because its -output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it -pushes the analysis from looking through the numbers (tons of numbers) -to something easier -- visual analysis. - -To generate plots: - -a) collect slabinfo extended records, for example:: - - while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done - -b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script:: - - slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN] - - The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records - and generates 3 png files (and 3 pre-processing cache files) per STATS - file: - - Slabcache Totals: FOO_STATS-totals.png - - Slabs sorted by size: FOO_STATS-slabs-by-size.png - - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png - -Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you -need to compare slabs' behaviour "prior to" and "after" some code -modification. 
To help you out there, ``slabinfo-gnuplot.sh`` script -can 'merge' the `Slabcache Totals` sections from different -measurements. To visually compare N plots: - -a) Collect as many STATS1, STATS2, .. STATSN files as you need:: - - while [ 1 ]; do slabinfo -X >> STATS; sleep 1; done - -b) Pre-process those STATS files:: - - slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN - -c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the - generated pre-processed \*-totals:: - - slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals - - This will produce a single plot (png file). - - Plots, expectedly, can be large so some fluctuations or small spikes - can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two - options to 'zoom-in'/'zoom-out': - - a) ``-s %d,%d`` -- overwrites the default image width and height - b) ``-r %d,%d`` -- specifies a range of samples to use (for example, - in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r - 40,60`` range will plot only samples collected between 40th and - 60th seconds). - - -DebugFS files for SLUB -====================== - -For more information about current state of SLUB caches with the user tracking -debug option enabled, debugfs files are available, typically under -/sys/kernel/debug/slab// (created only for caches with enabled user -tracking). There are 2 types of these files with the following debug -information: - -1. alloc_traces:: - - Prints information about unique allocation traces of the currently - allocated objects. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, allocating function, minimal/average/maximal jiffies since alloc, - pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. 
- - Example::: - - 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: - __slab_alloc+0x6d/0x90 - kmem_cache_alloc_trace+0x2eb/0x300 - populate_error_injection_list+0x97/0x110 - init_error_injection+0x1b/0x71 - do_one_initcall+0x5f/0x2d0 - kernel_init_freeable+0x26f/0x2d7 - kernel_init+0xe/0x118 - ret_from_fork+0x22/0x30 - - -2. free_traces:: - - Prints information about unique freeing traces of the currently allocated - objects. The freeing traces thus come from the previous life-cycle of the - objects and are reported as not available for objects allocated for the first - time. The output is sorted by frequency of each trace. - - Information in the output: - Number of objects, freeing function, minimal/average/maximal jiffies since free, - pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. - - Example::: - - 1980 age=4294912290 pid=0 cpus=0 - 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 - kfree+0x2db/0x420 - acpi_ut_update_ref_count+0x6a6/0x782 - acpi_ut_update_object_reference+0x1ad/0x234 - acpi_ut_remove_reference+0x7d/0x84 - acpi_rs_get_prt_method_data+0x97/0xd6 - acpi_get_irq_routing_table+0x82/0xc4 - acpi_pci_irq_find_prt_entry+0x8e/0x2e0 - acpi_pci_irq_lookup+0x3a/0x1e0 - acpi_pci_irq_enable+0x77/0x240 - pcibios_enable_device+0x39/0x40 - do_pci_enable_device.part.0+0x5d/0xe0 - pci_enable_device_flags+0xfc/0x120 - pci_enable_device+0x13/0x20 - virtio_pci_probe+0x9e/0x170 - local_pci_probe+0x48/0x80 - pci_device_probe+0x105/0x1c0 - -Christoph Lameter, May 30, 2007 -Sergey Senozhatsky, October 23, 2015 diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst deleted file mode 100644 index c08919662704..000000000000 --- a/Documentation/vm/split_page_table_lock.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. 
_split_page_table_lock: - -===================== -Split page table lock -===================== - -Originally, mm->page_table_lock spinlock protected all page tables of the -mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due to high contention on the lock. To improve -scalability, split page table lock was introduced. - -With split page table lock we have separate per-table lock to serialize -access to the table. At the moment we use split lock for PTE and PMD -tables. Access to higher level tables is protected by mm->page_table_lock. - -There are helpers to lock/unlock a table and other accessor functions: - - - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; - - pte_unmap_unlock() - unlocks and unmaps PTE table; - - pte_alloc_map_lock() - allocates PTE table if needed and takes the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; - - pmd_lock() - takes PMD table lock, returns pointer to taken lock; - - pmd_lockptr() - returns pointer to PMD table lock; - -Split page table lock for PTE tables is enabled at compile time if -CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less than or equal to NR_CPUS. -If split lock is disabled, all tables are guarded by mm->page_table_lock. - -Split page table lock for PMD tables is enabled, if it's enabled for PTE -tables and the architecture supports it (see below). - -Hugetlb and split page table lock -================================= - -Hugetlb can support several page sizes. We use split lock only for PMD -level, but not for PUD. 
- -Hugetlb-specific helpers: - - - huge_pte_lock() - takes pmd split lock for PMD_SIZE page, mm->page_table_lock - otherwise; - - huge_pte_lockptr() - returns pointer to table lock; - -Support of split page table lock by an architecture -=================================================== - -There's no need in special enabling of PTE split page table lock: everything -required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which -must be called on PTE table allocation / freeing. - -Make sure the architecture doesn't use slab allocator for page table -allocation: slab uses page->slab_cache for its pages. -This field shares storage with page->ptl. - -PMD split lock only makes sense if you have more than two page table -levels. - -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. - -Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and -pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing -paths: e.g. X86_PAE preallocates a few PMDs on pgd_alloc(). - -With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. - -NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must -be handled properly. - -page->ptl -========= - -page->ptl is used to access split page table lock, where 'page' is struct -page of page containing the table. It shares storage with page->private -(and few other fields in union). - -To avoid increasing size of struct page and have best performance, we use a -trick: - - - if spinlock_t fits into long, we use page->ptl as spinlock, so we - can avoid indirect access and save a cache line. - - if size of spinlock_t is bigger than size of long, we use page->ptl as - pointer to spinlock_t and allocate it dynamically. 
This allows to use - split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs - one more cache line for indirect access; - -The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in -pgtable_pmd_page_ctor() for PMD table. - -Please, never access page->ptl directly -- use appropriate helper. diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst deleted file mode 100644 index 78819bd4d745..000000000000 --- a/Documentation/vm/swap.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -==== -Swap -==== diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst deleted file mode 100644 index 216db1d67d04..000000000000 --- a/Documentation/vm/transhuge.rst +++ /dev/null @@ -1,187 +0,0 @@ -.. _transhuge: - -============================ -Transparent Hugepage Support -============================ - -This document describes design principles for Transparent Hugepage (THP) -support and its interaction with other parts of the memory management -system. - -Design principles -================= - -- "graceful fallback": mm components which don't have transparent hugepage - knowledge fall back to breaking huge pmd mapping into table of ptes and, - if necessary, split a transparent hugepage. Therefore these components - can continue working on the regular pages or regular pte mappings. 
- -- if a hugepage allocation fails because of memory fragmentation, - regular pages should be gracefully allocated instead and mixed in - the same vma without any failure or significant delay and without - userland noticing - -- if some task quits and more hugepages become available (either - immediately in the buddy or through the VM), guest physical memory - backed by regular pages should be relocated on hugepages - automatically (with khugepaged) - -- it doesn't require memory reservation and in turn it uses hugepages - whenever possible (the only possible reservation here is kernelcore= - to avoid unmovable pages to fragment all the memory but such a tweak - is not specific to transparent hugepage support and it's a generic - feature that applies to all dynamic high order allocations in the - kernel) - -get_user_pages and follow_page -============================== - -get_user_pages and follow_page if run on a hugepage, will return the -head or tail pages as usual (exactly as they would do on -hugetlbfs). Most GUP users will only care about the actual physical -address of the page and its temporary pinning to release after the I/O -is complete, so they won't ever notice the fact the page is huge. But -if any driver is going to mangle over the page structure of the tail -page (like for checking page->mapping or other bits that are relevant -for the head page and not the tail page), it should be updated to jump -to check head page instead. Taking a reference on any head/tail page would -prevent the page from being split by anyone. - -.. note:: - these aren't new constraints to the GUP API, and they match the - same constraints that apply to hugetlbfs too, so any driver capable - of handling GUP on hugetlbfs will also work fine on transparent - hugepage backed mappings. - -Graceful fallback -================= - -Code walking pagetables but unaware about huge pmds can simply call -split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by -pmd_offset. 
It's trivial to make the code transparent hugepage aware -by just grepping for "pmd_offset" and adding split_huge_pmd where -missing after pmd_offset returns the pmd. Thanks to the graceful -fallback design, with a one liner change, you can avoid to write -hundreds if not thousands of lines of complex code to make your code -hugepage aware. - -If you're not walking pagetables but you run into a physical hugepage -that you can't handle natively in your code, you can split it by -calling split_huge_page(page). This is what the Linux VM does before -it tries to swapout the hugepage for example. split_huge_page() can fail -if the page is pinned and you must handle this correctly. - -Example to make mremap.c transparent hugepage aware with a one liner -change:: - - diff --git a/mm/mremap.c b/mm/mremap.c - --- a/mm/mremap.c - +++ b/mm/mremap.c - @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru - return NULL; - - pmd = pmd_offset(pud, addr); - + split_huge_pmd(vma, pmd, addr); - if (pmd_none_or_clear_bad(pmd)) - return NULL; - -Locking in hugepage aware code -============================== - -We want as much code as possible hugepage aware, as calling -split_huge_page() or split_huge_pmd() has a cost. - -To make pagetable walks huge pmd aware, all you need to do is to call -pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_lock in read (or write) mode to be sure a huge pmd cannot be -created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_lock in write mode in addition to the anon_vma lock). If -pmd_trans_huge returns false, you just fallback in the old code -paths. If instead pmd_trans_huge returns true, you have to take the -page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the -page table lock will prevent the huge pmd being converted into a -regular pmd from under you (split_huge_pmd can run in parallel to the -pagetable walk). 
If the second pmd_trans_huge returns false, you -should just drop the page table lock and fallback to the old code as -before. Otherwise, you can proceed to process the huge pmd and the -hugepage natively. Once finished, you can drop the page table lock. - -Refcounts and transparent huge pages -==================================== - -Refcounting on THP is mostly consistent with refcounting on other compound -pages: - - - get_page()/put_page() and GUP operate on head page's ->_refcount. - - - ->_refcount in tail pages is always zero: get_page_unless_zero() never - succeeds on tail pages. - - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. - - - map/unmap of the whole compound page is accounted for in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. - -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. - -This optimization is required to lower the overhead of per-subpage mapcount -tracking. The alternative is to alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page is split -for the first time, but still have a PMD mapping. The additional references -go away with the last compound_mapcount. - -File pages get PG_double_map set on the first map of the page with PTE and -goes away when the page gets evicted from the page cache. - -split_huge_page internally has to distribute the refcounts in the head -page to the tail pages before clearing all PG_head/tail bits from the page -structures. 
It can be done easily for refcounts taken by page table -entries, but we don't have enough information on how to distribute any -additional pins (i.e. from get_user_pages). split_huge_page() fails any -requests to split pinned huge pages: it expects page count to be equal to -the sum of mapcount of all sub-pages plus one (split_huge_page caller must -have a reference to the head page). - -split_huge_page uses migration entries to stabilize page->_refcount and -page->_mapcount of anonymous pages. File pages just get unmapped. - -We are safe against physical memory scanners too: the only legitimate way -a scanner can get a reference to a page is get_page_unless_zero(). - -All tail pages have zero ->_refcount until atomic_add(). This prevents the -scanner from getting a reference to the tail page up to that point. After the -atomic_add() we don't care about the ->_refcount value. We already know how -many references should be uncharged from the head page. - -For head page get_page_unless_zero() will succeed and we don't mind. It's -clear where references should go after split: it will stay on the head page. - -Note that split_huge_pmd() doesn't have any limitations on refcounting: -pmd can be split at any point and never fails. - -Partial unmap and deferred_split_huge_page() -============================================ - -Unmapping part of THP (with munmap() or other way) is not going to free -memory immediately. Instead, we detect that a subpage of THP is not in use -in page_remove_rmap() and queue the THP for splitting if memory pressure -comes. Splitting will free up unused subpages. - -Splitting the page right away is not an option due to locking context in -the place where we can detect partial unmap. It also might be -counterproductive since in many cases partial unmap happens during exit(2) if -a THP crosses a VMA boundary. - -The function deferred_split_huge_page() is used to queue a page for splitting. 
-The splitting itself will happen when we get memory pressure via shrinker -interface. diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst deleted file mode 100644 index b280367d6a44..000000000000 --- a/Documentation/vm/unevictable-lru.rst +++ /dev/null @@ -1,554 +0,0 @@ -.. _unevictable_lru: - -============================== -Unevictable LRU Infrastructure -============================== - -.. contents:: :local: - - -Introduction -============ - -This document describes the Linux memory manager's "Unevictable LRU" -infrastructure and the use of this to manage several types of "unevictable" -pages. - -The document attempts to provide the overall rationale behind this mechanism -and the rationale for some of the design decisions that drove the -implementation. The latter design rationale is discussed in the context of an -implementation description. Admittedly, one can obtain the implementation -details - the "what does it do?" - by reading the code. One hopes that the -descriptions below add value by providing the answer to "why does it do that?". - - - -The Unevictable LRU -=================== - -The Unevictable LRU facility adds an additional LRU list to track unevictable -pages and to hide these pages from vmscan. This mechanism is based on a patch -by Larry Woodman of Red Hat to address several scalability problems with page -reclaim in Linux. The problems have been observed at customer sites on large -memory x86_64 systems. - -To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of -main memory will have over 32 million 4k pages in a single node. When a large -fraction of these pages are not evictable for any reason [see below], vmscan -will spend a lot of time scanning the LRU lists looking for the small fraction -of pages that are evictable. This can result in a situation where all CPUs are -spending 100% of their time in vmscan for hours or days on end, with the system -completely unresponsive. 
- -The unevictable list addresses the following classes of unevictable pages: - - * Those owned by ramfs. - - * Those mapped into SHM_LOCK'd shared memory regions. - - * Those mapped into VM_LOCKED [mlock()ed] VMAs. - -The infrastructure may also be able to handle other conditions that make pages -unevictable, either by definition or by circumstance, in the future. - - -The Unevictable LRU Page List ------------------------------ - -The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a -companion to the LRU-ordered anonymous and file, active and inactive page lists; -and now it is not even a page list. But following familiar convention, here in -this document and in the source, we often imagine it as a fifth LRU page list. - -The Unevictable LRU infrastructure consists of an additional, per-node, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. - -The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when -PG_lru is set. - -The Unevictable LRU infrastructure maintains unevictable pages as if they were -on an additional LRU list for a few reasons: - - (1) We get to "treat unevictable pages just like we treat other pages in the - system - which means we get to use the same code to manipulate them, the - same code to isolate them (for migrate, etc.), the same code to keep track - of the statistics, etc..." [Rik van Riel] - - (2) We want to be able to migrate unevictable pages between nodes for memory - defragmentation, workload management and memory hotplug. The Linux kernel - can only migrate pages that it can successfully isolate from the LRU - lists (or "Movable" pages: outside of consideration here). 
If we were to - maintain pages elsewhere than on an LRU-like list, where they can be - detected by isolate_lru_page(), we would prevent their migration. - -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. - -The unevictable list benefits from the "arrayification" of the per-node LRU -lists and statistics originally proposed and posted by Christoph Lameter. - - -Memory Control Group Interaction --------------------------------- - -The unevictable LRU facility interacts with the memory control group [aka -memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by -extending the lru_list enum. - -The memory controller data structure automatically gets a per-node unevictable -list as a result of the "arrayification" of the per-node LRU lists (one per -lru_list enum element). The memory controller tracks the movement of pages to -and from the unevictable list. - -When a memory control group comes under memory pressure, the controller will -not attempt to reclaim pages on the unevictable list. This has a couple of -effects: - - (1) Because the pages are "hidden" from reclaim on the unevictable list, the - reclaim process can be more efficient, dealing only with pages that have a - chance of being reclaimed. - - (2) On the other hand, if too many of the pages charged to the control group - are unevictable, the evictable portion of the working set of the tasks in - the control group may not fit into the available memory. This can cause - the control group to thrash or to OOM-kill tasks. - - -.. _mark_addr_space_unevict: - -Marking Address Spaces Unevictable ----------------------------------- - -For facilities such as ramfs none of the pages attached to the address space -may be evicted. 
To prevent eviction of any such pages, the AS_UNEVICTABLE -address space flag is provided, and this can be manipulated by a filesystem -using a number of wrapper functions: - - * ``void mapping_set_unevictable(struct address_space *mapping);`` - - Mark the address space as being completely unevictable. - - * ``void mapping_clear_unevictable(struct address_space *mapping);`` - - Mark the address space as being evictable. - - * ``int mapping_unevictable(struct address_space *mapping);`` - - Query the address space, and return true if it is completely - unevictable. - -These are currently used in three places in the kernel: - - (1) By ramfs to mark the address spaces of its inodes when they are created, - and this mark remains for the life of the inode. - - (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. - Note that SHM_LOCK is not required to page in the locked pages if they're - swapped out; the application must touch the pages manually if it wants to - ensure they're in memory. - - (3) By the i915 driver to mark pinned address space until it's unpinned. The - amount of unevictable memory marked by i915 driver is roughly the bounded - object size in debugfs/dri/0/i915_gem_objects. - - -Detecting Unevictable Pages ---------------------------- - -The function page_evictable() in mm/internal.h determines whether a page is -evictable or not using the query function outlined above [see section -:ref:`Marking address spaces unevictable `] -to check the AS_UNEVICTABLE flag. - -For address spaces that are so marked after being populated (as SHM regions -might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate -the page tables for the region as does, for example, mlock(), nor need it make -any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during -a reclamation scan. 
- -On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan -the pages in the region and "rescue" them from the unevictable list if no other -condition is keeping them unevictable. If an unevictable region is destroyed, -the pages are also "rescued" from the unevictable list in the process of -freeing them. - -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. - - -Vmscan's Handling of Unevictable Pages --------------------------------------- - -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they -have become evictable again (via munlock() for example) and have been "rescued" -from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave an unevictable page on one of the regular -active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the -unevictable list for the memory cgroup and node being scanned. - -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_active_list() or shrink_page_list() where they will be detected when -vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page -is culled to the unevictable list when it is released by the shrinker. - -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. 
Because the condition which makes the page unevictable -may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the -unevictable state of a page before placing it on the unevictable list. - - -MLOCKED Pages -============= - -The unevictable page list is also useful for mlock(), in addition to ramfs and -SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in -NOMMU situations, all mappings are effectively mlocked. - - -History -------- - -The "Unevictable mlocked Pages" infrastructure is based on work originally -posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". -Nick posted his patch as an alternative to a patch posted by Christoph Lameter -to achieve the same objective: hiding mlocked pages from vmscan. - -In Nick's patch, he used one of the struct page LRU list link fields as a count -of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years -earlier). But this use of the link field for a count prevented the management -of the pages on an LRU list, and thus mlocked pages were not migratable as -isolate_lru_page() could not detect them, and the LRU list link field was not -available to the migration subsystem. - -Nick resolved this by putting mlocked pages back on the LRU list before -attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When -Nick's patch was integrated with the Unevictable LRU work, the count was -replaced by walking the reverse map when munlocking, to determine whether any -other VM_LOCKED VMAs still mapped the page. - -However, walking the reverse map for each page when munlocking was ugly and -inefficient, and could lead to catastrophic contention on a file's rmap lock, -when many processes which had it mlocked were trying to exit. In 5.18, the -idea of keeping mlock_count in Unevictable LRU list link field was revived and -put to work, without preventing the migration of mlocked pages. 
This is why -the "Unevictable LRU list" cannot be a linked list of pages now; but there was -no use for that linked list anyway - though its size is maintained for meminfo. - - -Basic Management ----------------- - -mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable -pages. When such a page has been "noticed" by the memory management subsystem, -the page is marked with the PG_mlocked flag. This can be manipulated using the -PageMlocked() functions. - -A PG_mlocked page will be placed on the unevictable list when it is added to -the LRU. Such pages can be "noticed" by memory management in several places: - - (1) in the mlock()/mlock2()/mlockall() system call handlers; - - (2) in the mmap() system call handler when mmapping a region with the - MAP_LOCKED flag; - - (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE - flag; - - (4) in the fault path and when a VM_LOCKED stack segment is expanded; or - - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to - reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap(). - -mlocked pages become unlocked and rescued from the unevictable list when: - - (1) mapped in a range unlocked via the munlock()/munlockall() system calls; - - (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including - unmapping at task exit; - - (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; - or - - (4) before a page is COW'd in a VM_LOCKED VMA. - - -mlock()/mlock2()/mlockall() System Call Handling ------------------------------------------------- - -mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup() -for each VMA in the range specified by the call. In the case of mlockall(), -this is the entire active address space of the task. Note that mlock_fixup() -is used for both mlocking and munlocking a range of memory. 
A call to mlock() -an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is -treated as a no-op and mlock_fixup() simply returns. - -If the VMA passes some filtering as described in "Filtering Special VMAs" -below, mlock_fixup() will attempt to merge the VMA with its neighbors or split -off a subset of the VMA if the range does not cover the entire VMA. Any pages -already present in the VMA are then marked as mlocked by mlock_page() via -mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). - -Before returning from the system call, do_mlock() or mlockall() will call -__mm_populate() to fault in the remaining pages via get_user_pages() and to -mark those pages as mlocked as they are faulted. - -Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, -get_user_pages() will be unable to fault in the pages. That's okay. If pages -do end up getting faulted into this VM_LOCKED VMA, they will be handled in the -fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. - -For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED -(unless it is a PTE mapping of a part of a transparent huge page). Or when -it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() -calls mlock_new_page() instead: similar to mlock_page(), but can make better -judgments, since this page is held exclusively and known not to be on LRU yet. - -mlock_page() sets PageMlocked immediately, then places the page on the CPU's -mlock pagevec, to batch up the rest of the work to be done under lru_lock by -__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count -and moves the page to unevictable state ("the unevictable LRU", but with -mlock_count in place of LRU threading). Or if the page was already PageLRU -and PageUnevictable and PageMlocked, it simply increments the mlock_count. 
- -But in practice that may not work ideally: the page may not yet be on an LRU, or -it may have been temporarily isolated from LRU. In such cases the mlock_count -field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() -returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: -rather than risk stranding a page indefinitely as unevictable, always err with -mlock_count on the low side, so that when munlocked the page will be rescued to -an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a -VM_LOCKED VMA. - - -Filtering Special VMAs ----------------------- - -mlock_fixup() filters several classes of "special" VMAs: - -1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind - these mappings are inherently pinned, so we don't need to mark them as - mlocked. In any case, most of the pages have no struct page in which to so - mark the page. Because of this, get_user_pages() will fail for these VMAs, - so there is no sense in attempting to visit them. - -2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We - neither need nor want to mlock() these pages. But __mm_populate() includes - hugetlbfs ranges, allocating the huge pages and populating the PTEs. - -3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, - such as the VDSO page, relay channel pages, etc. These pages are inherently - unevictable and are not managed on the LRU lists. __mm_populate() includes - these ranges, populating the PTEs if not already populated. - -4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate() - includes these ranges, populating the PTEs if not already populated. - -Note that for all of these special VMAs, mlock_fixup() does not set the -VM_LOCKED flag. Therefore, we won't have to deal with them later during -munlock(), munmap() or task exit. Neither does mlock_fixup() account these -VMAs against the task's "locked_vm". 
- - -munlock()/munlockall() System Call Handling -------------------------------------------- - -The munlock() and munlockall() system calls are handled by the same -mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are. -If called to munlock an already munlocked VMA, mlock_fixup() simply returns. -Because of the VMA filtering discussed above, VM_LOCKED will not be set in -any "special" VMAs. So, those VMAs will be ignored for munlock. - -If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. All pages in the VMA are then munlocked by munlock_page() via -mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same -function used when mlocking a VMA range, with new flags for the VMA indicating -that it is munlock() being performed. - -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. - -But in practice that may not work ideally: the page may not yet have reached -"the unevictable LRU", or it may have been temporarily isolated from it. In -those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked -again later if vmscan finds it in a VM_LOCKED VMA. - - -Migrating MLOCKED Pages ------------------------ - -A page that is being migrated has been isolated from the LRU lists and is held -locked across unmapping of the page, updating the page's address space entry -and copying the contents and state, until the page table entry has been -replaced with an entry that refers to the new page. Linux supports migration -of mlocked pages and other unevictable pages. 
PG_mlocked is cleared from the -the old page when it is unmapped from the last VM_LOCKED VMA, and set when the -new page is mapped in place of migration entry in a VM_LOCKED VMA. If the page -was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the -page was unevictable for other reasons, PG_unevictable is copied explicitly. - -Note that page migration can race with mlocking or munlocking of the same page. -There is mostly no problem since page migration requires unmapping all PTEs of -the old page (including munlock where VM_LOCKED), then mapping in the new page -(including mlock where VM_LOCKED). The page table locks provide sufficient -synchronization. - -However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, -before mlocking any pages already present, if one of those pages were migrated -before mlock_pte_range() reached it, it would get counted twice in mlock_count. -To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, -so that mlock_vma_page() will skip it. - -To complete page migration, we place the old and new pages back onto the LRU -afterwards. The "unneeded" page - old page on success, new page on failure - -is freed when the reference count held by the migration process is released. - - -Compacting MLOCKED Pages ------------------------- - -The memory map can be scanned for compactable regions and the default behavior -is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed -controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work -of compaction is mostly handled by the page migration code and the same work -flow as described in Migrating MLOCKED Pages will apply. - - -MLOCKING Transparent Huge Pages -------------------------------- - -A transparent huge page is represented by a single entry on an LRU list. -Therefore, we can only make unevictable an entire compound page, not -individual subpages. 
- -If a user tries to mlock() part of a huge page, and no user mlock()s the -whole of the huge page, we want the rest of the page to be reclaimable. - -We cannot just split the page on partial mlock() as split_huge_page() can -fail and a new intermittent failure mode for the syscall is undesirable. - -We handle this by keeping PTE-mlocked huge pages on evictable LRU lists: -the PMD on the border of a VM_LOCKED VMA will be split into a PTE table. - -This way the huge page is accessible for vmscan. Under memory pressure the -page will be split, subpages which belong to VM_LOCKED VMAs will be moved -to the unevictable LRU and the rest can be reclaimed. - -/proc/meminfo's Unevictable and Mlocked amounts do not include those parts -of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs. - - -mmap(MAP_LOCKED) System Call Handling -------------------------------------- - -In addition to the mlock(), mlock2() and mlockall() system calls, an application -can request that a region of memory be mlocked by supplying the MAP_LOCKED flag -to the mmap() call. There is one important and subtle difference here, though. -mmap() + mlock() will fail if the range cannot be faulted in (e.g. because -mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. -The mmaped area will still have properties of the locked area - pages will not -get swapped out - but major page faults to fault memory in might still happen. - -Furthermore, any mmap() call or brk() call that expands the heap by a task -that has previously called mlockall() with the MCL_FUTURE flag will result -in the newly mapped memory being mlocked. Before the unevictable/mlock -changes, the kernel simply called make_pages_present() to allocate pages -and populate the page table. 
- -To mlock a range of memory under the unevictable/mlock infrastructure, -the mmap() handler and task address space expansion functions call -populate_vma_page_range() specifying the vma and the address range to mlock. - - -munmap()/exit()/exec() System Call Handling -------------------------------------------- - -When unmapping an mlocked region of memory, whether by an explicit call to -munmap() or via an internal unmap from exit() or exec() processing, we must -munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any -way, so unmapping them required no processing. - -For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED -(unless it was a PTE mapping of a part of a transparent huge page). - -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. - -But in practice that may not work ideally: the page may not yet have reached -"the unevictable LRU", or it may have been temporarily isolated from it. In -those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked -again later if vmscan finds it in a VM_LOCKED VMA. - - -Truncating MLOCKED Pages ------------------------- - -File truncation or hole punching forcibly unmaps the deleted pages from -userspace; truncation even unmaps and deletes any private anonymous pages -which had been Copied-On-Write from the file pages now being truncated. 
- -Mlocked pages can be munlocked and deleted in this way: like with munmap(), -for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED -(unless it was a PTE mapping of a part of a transparent huge page). - -However, if there is a racing munlock(), since mlock_vma_pages_range() starts -munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages -present, if one of those pages were unmapped by truncation or hole punch before -mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, -and would not be counted out of mlock_count. In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to -release_pages() (or __page_cache_release()) to clear it and update statistics -before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, -which is usually 0). - - -Page Reclaim in shrink_*_list() -------------------------------- - -vmscan's shrink_active_list() culls any obviously unevictable pages - -i.e. !page_evictable(page) pages - diverting those to the unevictable list. -However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable -set - otherwise they would be on the unevictable list and shrink_active_list() -would never see them. - -Some examples of these unevictable pages on the LRU lists are: - - (1) ramfs pages that have been placed on the LRU lists when first allocated. - - (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to - allocate or fault in the pages in the shared memory region. This happens - when an application accesses the page the first time after SHM_LOCK'ing - the segment. - - (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, - but events left mlock_count too low, so they were munlocked too early. 
- -vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously -unevictable pages found on the inactive lists to the appropriate memory cgroup -and node unevictable list. - -rmap's page_referenced_one(), called via vmscan's shrink_active_list() or -shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() -to correct them. Such pages are culled to the unevictable list when released -by the shrinker. diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst deleted file mode 100644 index 363fe20d6b9f..000000000000 --- a/Documentation/vm/vmalloc.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================================== -Virtually Contiguous Memory Allocation -====================================== diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst deleted file mode 100644 index fc8c67833af6..000000000000 --- a/Documentation/vm/vmalloced-kernel-stacks.rst +++ /dev/null @@ -1,153 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -===================================== -Virtually Mapped Kernel Stack Support -===================================== - -:Author: Shuah Khan - -.. contents:: :local: - -Overview --------- - -This is a compilation of information from the code and original patch -series that introduced the `Virtually Mapped Kernel Stacks feature -` - -Introduction ------------- - -Kernel stack overflows are often hard to debug and make the kernel -susceptible to exploits. Problems could show up at a later time making -it difficult to isolate and root-cause. - -Virtually-mapped kernel stacks with guard pages causes kernel stack -overflows to be caught immediately rather than causing difficult to -diagnose corruptions. - -HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable -support for virtually mapped stacks with guard pages. 
This feature -causes reliable faults when the stack overflows. The usability of -the stack trace after overflow and response to the overflow itself -is architecture dependent. - -.. note:: - As of this writing, arm64, powerpc, riscv, s390, um, and x86 have - support for VMAP_STACK. - -HAVE_ARCH_VMAP_STACK --------------------- - -Architectures that can support Virtually Mapped Kernel Stacks should -enable this bool configuration option. The requirements are: - -- vmalloc space must be large enough to hold many kernel stacks. This - may rule out many 32-bit architectures. -- Stacks in vmalloc space need to work reliably. For example, if - vmap page tables are created on demand, either this mechanism - needs to work while the stack points to a virtual address with - unpopulated page tables or arch code (switch_to() and switch_mm(), - most likely) needs to ensure that the stack's page table entries - are populated before running on a possibly unpopulated stack. -- If the stack overflows into a guard page, something reasonable - should happen. The definition of "reasonable" is flexible, but - instantly rebooting without logging anything would be unfriendly. - -VMAP_STACK ----------- - -VMAP_STACK bool configuration option when enabled allocates virtually -mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. - -- Enable this if you want the use virtually-mapped kernel stacks - with guard pages. This causes kernel stack overflows to be caught - immediately rather than causing difficult-to-diagnose corruption. - -.. note:: - - Using this feature with KASAN requires architecture support - for backing virtual mappings with real shadow memory, and - KASAN_VMALLOC must be enabled. - -.. note:: - - VMAP_STACK is enabled, it is not possible to run DMA on stack - allocated data. - -Kernel configuration options and dependencies keep changing. 
Refer to -the latest code base: - -`Kconfig ` - -Allocation ------------ - -When a new kernel thread is created, thread stack is allocated from -virtually contiguous memory pages from the page level allocator. These -pages are mapped into contiguous kernel virtual space with PAGE_KERNEL -protections. - -alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack -with PAGE_KERNEL protections. - -- Allocated stacks are cached and later reused by new threads, so memcg - accounting is performed manually on assigning/releasing stacks to tasks. - Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. -- vm_struct is cached to be able to find when thread free is initiated - in interrupt context. free_thread_stack() can be called in interrupt - context. -- On arm64, all VMAP's stacks need to have the same alignment to ensure - that VMAP'd stack overflow detection works correctly. Arch specific - vmap stack allocator takes care of this detail. -- This does not address interrupt stacks - according to the original patch - -Thread stack allocation is initiated from clone(), fork(), vfork(), -kernel_thread() via kernel_clone(). Leaving a few hints for searching -the code base to understand when and how thread stack is allocated. - -Bulk of the code is in: -`kernel/fork.c `. - -stack_vm_area pointer in task_struct keeps track of the virtually allocated -stack and a non-null stack_vm_area pointer serves as a indication that the -virtually mapped kernel stacks are enabled. - -:: - - struct vm_struct *stack_vm_area; - -Stack overflow handling ------------------------ - -Leading and trailing guard pages help detect stack overflows. When stack -overflows into the guard pages, handlers have to be careful not overflow -the stack again. When handlers are called, it is likely that very little -stack space is left. - -On x86, this is done by handling the page fault indicating the kernel -stack overflow on the double-fault stack. 
- -Testing VMAP allocation with guard pages ----------------------------------------- - -How do we ensure that VMAP_STACK is actually allocating with a leading -and trailing guard page? The following lkdtm tests can help detect any -regressions. - -:: - - void lkdtm_STACK_GUARD_PAGE_LEADING() - void lkdtm_STACK_GUARD_PAGE_TRAILING() - -Conclusions ------------ - -- A percpu cache of vmalloced stacks appears to be a bit faster than a - high-order stack allocation, at least when the cache hits. -- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and - simply embed the thread_info (containing only flags) and 'int cpu' into - task_struct. -- The thread stack can be free'ed as soon as the task is dead (without - waiting for RCU) and then, if vmapped stacks are in use, cache the - entire stack for reuse on the same cpu. diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst deleted file mode 100644 index c9c495f62d12..000000000000 --- a/Documentation/vm/vmemmap_dedup.rst +++ /dev/null @@ -1,223 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -========================================= -A vmemmap diet for HugeTLB and Device DAX -========================================= - -HugeTLB -======= - -The struct page structures (page structs) are used to describe a physical -page frame. By default, there is a one-to-one mapping from a page frame to -it's corresponding page struct. - -HugeTLB pages consist of multiple base page size pages and is supported by many -architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more -details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are -currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page -consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. -For each base page, there is a corresponding page struct. 
- -Within the HugeTLB subsystem, only the first 4 page structs are used to -contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides -this upper limit. The only 'useful' information in the remaining page structs -is the compound_head field, and this field is the same for all tail pages. - -By removing redundant page structs for HugeTLB pages, memory can be returned -to the buddy allocator for other uses. - -Different architectures support different HugeTLB pages. For example, the -following table is the HugeTLB page size supported by x86 and arm64 -architectures. Because arm64 supports 4k, 16k, and 64k base pages and -supports contiguous entries, so it supports many kinds of sizes of HugeTLB -page. - -+--------------+-----------+-----------------------------------------------+ -| Architecture | Page Size | HugeTLB Page Size | -+--------------+-----------+-----------+-----------+-----------+-----------+ -| x86-64 | 4KB | 2MB | 1GB | | | -+--------------+-----------+-----------+-----------+-----------+-----------+ -| | 4KB | 64KB | 2MB | 32MB | 1GB | -| +-----------+-----------+-----------+-----------+-----------+ -| arm64 | 16KB | 2MB | 32MB | 1GB | | -| +-----------+-----------+-----------+-----------+-----------+ -| | 64KB | 2MB | 512MB | 16GB | | -+--------------+-----------+-----------+-----------+-----------+-----------+ - -When the system boot up, every HugeTLB page has more than one struct page -structs which size is (unit: pages):: - - struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - -Where HugeTLB_Size is the size of the HugeTLB page. We know that the size -of the HugeTLB page is always n times PAGE_SIZE. So we can get the following -relationship:: - - HugeTLB_Size = n * PAGE_SIZE - -Then:: - - struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE - = n * sizeof(struct page) / PAGE_SIZE - -We can use huge mapping at the pud/pmd level for the HugeTLB page. 
- -For the HugeTLB page of the pmd level mapping, then:: - - struct_size = n * sizeof(struct page) / PAGE_SIZE - = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE - = sizeof(struct page) / sizeof(pte_t) - = 64 / 8 - = 8 (pages) - -Where n is how many pte entries which one page can contains. So the value of -n is (PAGE_SIZE / sizeof(pte_t)). - -This optimization only supports 64-bit system, so the value of sizeof(pte_t) -is 8. And this optimization also applicable only when the size of struct page -is a power of two. In most cases, the size of struct page is 64 bytes (e.g. -x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the -size of struct page structs of it is 8 page frames which size depends on the -size of the base page. - -For the HugeTLB page of the pud level mapping, then:: - - struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd) - = PAGE_SIZE / 8 * 8 (pages) - = PAGE_SIZE (pages) - -Where the struct_size(pmd) is the size of the struct page structs of a -HugeTLB page of the pmd level mapping. - -E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB -HugeTLB page consists in 4096. - -Next, we take the pmd level mapping of the HugeTLB page as an example to -show the internal implementation of this optimization. There are 8 pages -struct page structs associated with a HugeTLB page which is pmd mapped. 
- -Here is how things look before optimization:: - - HugeTLB struct pages(8 pages) page frame(8 pages) - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | -------------> | 1 | - | | +-----------+ +-----------+ - | | | 2 | -------------> | 2 | - | | +-----------+ +-----------+ - | | | 3 | -------------> | 3 | - | | +-----------+ +-----------+ - | | | 4 | -------------> | 4 | - | PMD | +-----------+ +-----------+ - | level | | 5 | -------------> | 5 | - | mapping | +-----------+ +-----------+ - | | | 6 | -------------> | 6 | - | | +-----------+ +-----------+ - | | | 7 | -------------> | 7 | - | | +-----------+ +-----------+ - | | - | | - | | - +-----------+ - -The value of page->compound_head is the same for all tail pages. The first -page of page structs (page 0) associated with the HugeTLB page contains the 4 -page structs necessary to describe the HugeTLB. The only use of the remaining -pages of page structs (page 1 to page 7) is to point to page->compound_head. -Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs -will be used for each HugeTLB page. This will allow us to free the remaining -7 pages to the buddy allocator. 
- -Here is how things look after remapping:: - - HugeTLB struct pages(8 pages) page frame(8 pages) - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ - | | +-----------+ | | | | | | - | | | 2 | -----------------+ | | | | | - | | +-----------+ | | | | | - | | | 3 | -------------------+ | | | | - | | +-----------+ | | | | - | | | 4 | ---------------------+ | | | - | PMD | +-----------+ | | | - | level | | 5 | -----------------------+ | | - | mapping | +-----------+ | | - | | | 6 | -------------------------+ | - | | +-----------+ | - | | | 7 | ---------------------------+ - | | +-----------+ - | | - | | - | | - +-----------+ - -When a HugeTLB is freed to the buddy system, we should allocate 7 pages for -vmemmap pages and restore the previous mapping relationship. - -For the HugeTLB page of the pud level mapping. It is similar to the former. -We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. - -Apart from the HugeTLB page of the pmd/pud level mapping, some architectures -(e.g. aarch64) provides a contiguous bit in the translation table entries -that hints to the MMU to indicate that it is one of a contiguous set of -entries that can be cached in a single TLB entry. - -The contiguous bit is used to increase the mapping size at the pmd and pte -(last) level. So this type of HugeTLB page can be optimized only when its -size of the struct page structs is greater than 1 page. - -Notice: The head vmemmap page is not freed to the buddy allocator and all -tail vmemmap pages are mapped to the head vmemmap page frame. So we can see -more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) -associated with each HugeTLB page. The compound_head() can handle this -correctly (more details refer to the comment above compound_head()). 
- -Device DAX -========== - -The device-dax interface uses the same tail deduplication technique explained -in the previous chapter, except when used with the vmemmap in -the device (altmap). - -The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), -PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). - -The differences with HugeTLB are relatively minor. - -It only use 3 page structs for storing all information as opposed -to 4 on HugeTLB pages. - -There's no remapping of vmemmap given that device-dax memory is not part of -System RAM ranges initialized at boot. Thus the tail page deduplication -happens at a later stage when we populate the sections. HugeTLB reuses the -the head vmemmap page representing, whereas device-dax reuses the tail -vmemmap page. This results in only half of the savings compared to HugeTLB. - -Deduplicated tail pages are not mapped read-only. - -Here's how things look like on device-dax after the sections are populated:: - - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | -------------> | 1 | - | | +-----------+ +-----------+ - | | | 2 | ----------------^ ^ ^ ^ ^ ^ - | | +-----------+ | | | | | - | | | 3 | ------------------+ | | | | - | | +-----------+ | | | | - | | | 4 | --------------------+ | | | - | PMD | +-----------+ | | | - | level | | 5 | ----------------------+ | | - | mapping | +-----------+ | | - | | | 6 | ------------------------+ | - | | +-----------+ | - | | | 7 | --------------------------+ - | | +-----------+ - | | - | | - | | - +-----------+ diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst deleted file mode 100644 index 224e3c61d686..000000000000 --- a/Documentation/vm/z3fold.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _z3fold: - -====== -z3fold -====== - -z3fold is a special purpose allocator for storing compressed pages. 
-It is designed to store up to three compressed pages per physical page. -It is a zbud derivative which allows for higher compression -ratio keeping the simplicity and determinism of its predecessor. - -The main differences between z3fold and zbud are: - -* unlike zbud, z3fold allows for up to PAGE_SIZE allocations -* z3fold can hold up to 3 compressed pages in its page -* z3fold doesn't export any API itself and is thus intended to be used - via the zpool API. - -To keep the determinism and simplicity, z3fold, just like zbud, always -stores an integral number of compressed pages per page, but it can store -up to 3 pages unlike zbud which can store at most 2. Therefore the -compression ratio goes to around 2.7x while zbud's one is around 1.7x. - -Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not -return a dereferenceable pointer. Instead, it returns an unsigned long -handle which encodes actual location of the allocated object. - -Keeping effective compression ratio close to zsmalloc's, z3fold doesn't -depend on MMU enabled and provides more predictable reclaim behavior -which makes it a better fit for small and response-critical systems. diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst deleted file mode 100644 index 6e79893d6132..000000000000 --- a/Documentation/vm/zsmalloc.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. _zsmalloc: - -======== -zsmalloc -======== - -This allocator is designed for use with zram. Thus, the allocator is -supposed to work well under low memory conditions. In particular, it -never attempts higher order page allocation which is very likely to -fail under memory pressure. On the other hand, if we just use single -(0-order) pages, it would suffer from very high fragmentation -- -any object of size PAGE_SIZE/2 or larger would occupy an entire page. -This was one of the major issues with its predecessor (xvmalloc). 
- -To overcome these issues, zsmalloc allocates a bunch of 0-order pages -and links them together using various 'struct page' fields. These linked -pages act as a single higher-order page i.e. an object can span 0-order -page boundaries. The code refers to these linked pages as a single entity -called zspage. - -For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE -since this satisfies the requirements of all its current users (in the -worst case, page is incompressible and is thus stored "as-is" i.e. in -uncompressed form). For allocation requests larger than this size, failure -is returned (see zs_malloc). - -Additionally, zs_malloc() does not return a dereferenceable pointer. -Instead, it returns an opaque handle (unsigned long) which encodes actual -location of the allocated object. The reason for this indirection is that -zsmalloc does not keep zspages permanently mapped since that would cause -issues on 32-bit systems where the VA region for kernel space mappings -is very small. So, before using the allocating memory, the object has to -be mapped using zs_map_object() to get a usable pointer and subsequently -unmapped using zs_unmap_object(). - -stat -==== - -With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via -``/sys/kernel/debug/zsmalloc/``. Here is a sample of stat output:: - - # cat /sys/kernel/debug/zsmalloc/zram0/classes - - class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage - ... - ... - 9 176 0 1 186 129 8 4 - 10 192 1 0 2880 2872 135 3 - 11 208 0 1 819 795 42 2 - 12 224 0 1 219 159 12 4 - ... - ... 
- - -class - index -size - object size zspage stores -almost_empty - the number of ZS_ALMOST_EMPTY zspages(see below) -almost_full - the number of ZS_ALMOST_FULL zspages(see below) -obj_allocated - the number of objects allocated -obj_used - the number of objects allocated to the user -pages_used - the number of pages allocated for the class -pages_per_zspage - the number of 0-order pages to make a zspage - -We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where - -* n = number of allocated objects -* N = total number of objects zspage can store -* f = fullness_threshold_frac(ie, 4 at the moment) - -Similarly, we assign zspage to: - -* ZS_ALMOST_FULL when n > N / f -* ZS_EMPTY when n == 0 -* ZS_FULL when n == N diff --git a/MAINTAINERS b/MAINTAINERS index fe5daf141501..55fb1daa9057 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5526,7 +5526,7 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ -F: Documentation/vm/damon/ +F: Documentation/mm/damon/ F: include/linux/damon.h F: include/trace/events/damon.h F: mm/damon/ @@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management M: Jérôme Glisse L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/hmm.rst +F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* @@ -9135,8 +9135,8 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages F: Documentation/admin-guide/mm/hugetlbpage.rst -F: Documentation/vm/hugetlbfs_reserv.rst -F: Documentation/vm/vmemmap_dedup.rst +F: Documentation/mm/hugetlbfs_reserv.rst +F: Documentation/mm/vmemmap_dedup.rst F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: mm/hugetlb.c @@ -15072,7 +15072,7 @@ M: Pasha Tatashin M: Andrew Morton L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/page_table_check.rst +F: Documentation/mm/page_table_check.rst F: include/linux/page_table_check.h F: mm/page_table_check.c @@ -22158,7 +22158,7 @@ 
M: Nitin Gupta R: Sergey Senozhatsky L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/zsmalloc.rst +F: Documentation/mm/zsmalloc.rst F: include/linux/zsmalloc.h F: mm/zsmalloc.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 1920d52653b4..db2838cf8c02 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE Say Y to support efficient handling of sparse physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See <file:Documentation/vm/numa.rst> for more. + See <file:Documentation/mm/numa.rst> for more. config ARCH_ENABLE_THP_MIGRATION def_bool y diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb9d5fd39d7f..392ff48f77df 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, * should return true. * We should not call this on a hugetlb entry. We should check for HugeTLB * entry using vma->vm_flags - * The page table walk rule is explained in Documentation/vm/transhuge.rst + * The page table walk rule is explained in Documentation/mm/transhuge.rst */ static inline int pmd_trans_huge(pmd_t pmd) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d5a6f101f843..126a36571667 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -4,7 +4,7 @@ * * Authors: Jérôme Glisse * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. + * See Documentation/mm/hmm.rst for reasons and overview of what HMM is. */ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -100,7 +100,7 @@ struct hmm_range { }; /* - * Please see Documentation/vm/hmm.rst for how to use the range API. + * Please see Documentation/mm/hmm.rst for how to use the range API.
*/ int hmm_range_fault(struct hmm_range *range); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8af304f6b504..9f5ee49482de 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -39,7 +39,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. + * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 45fc2c81e370..d6c06e140277 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -198,7 +198,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.rst + * discussion on this see Documentation/mm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8cd975a8bfeb..2a243616f222 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) @@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also for an in-depth explanation + * See also for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. 
*/ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0c0fed1b348f..95a5b7aa1ae9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,7 +74,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.rst. Short description is we need struct pages for + * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index 169e64192e48..c1fa4993a56f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -663,7 +663,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.rst for more information: KSM is inactive + See Documentation/mm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1ab091f49fc0..dc7df1254f0a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -35,7 +35,7 @@ #include /* - * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics + * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. */ diff --git a/mm/frontswap.c b/mm/frontswap.c index 6f69b044a8cc..1a97610308cb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -4,7 +4,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. 
See - * Documentation/vm/frontswap.rst for more information. + * Documentation/mm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 834f288b3769..f9b90a8d7dfa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a57e1be41401..b36a4ef87a2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4875,7 +4875,7 @@ again: * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); @@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(&range); @@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) i_mmap_unlock_write(vma->vm_file->f_mapping); /* * No need to call mmu_notifier_invalidate_range(), see - * Documentation/vm/mmu_notifier.rst. + * Documentation/mm/mmu_notifier.rst. 
*/ mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1089ea8a9c98..ba29c15c53d6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,7 +6,7 @@ * * Author: Muchun Song * - * See Documentation/vm/vmemmap_dedup.rst + * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecae..8d2dc501c92c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 61e6135c54ef..c14d7286a379 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 5bcb334cd6f2..65e0a767b837 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * downgrading page table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ if (ret) cleaned++; @@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * to point at a new folio while a device is * still using this folio. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(&folio->page)); } @@ -1775,7 +1775,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) @@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 652f11a05749..3ff88a2eefb8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, /* * Reuse the previous page for the rest of tail pages - * See layout diagram in Documentation/vm/vmemmap_dedup.rst + * See layout diagram in Documentation/mm/vmemmap_dedup.rst */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, diff --git a/mm/util.c b/mm/util.c index 0837570c9225..5df8f2db7ca9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. 
See Documentation/vm/overcommit-accounting.rst + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c149427eb1c9..74c3dcecf64d 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -8,7 +8,7 @@ * Or sort by total memory: * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * - * See Documentation/vm/page_owner.rst + * See Documentation/mm/page_owner.rst */ #include -- cgit v1.2.3 From 507db7927cd181d409dd495c8384b8e14c21c600 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sun, 3 Jul 2022 18:08:36 -0700 Subject: mm: rmap: use the correct parameter name for DEFINE_PAGE_VMA_WALK The parameter used by DEFINE_PAGE_VMA_WALK is _page not page, fix the parameter name. It didn't cause any build error, it is probably because the only caller is write_protect_page() from ksm.c, which pass in page. 
Link: https://lkml.kernel.org/r/20220512174551.81279-1-shy828301@gmail.com Fixes: 2aff7a4755be ("mm: Convert page_vma_mapped_walk to work on PFNs") Signed-off-by: Yang Shi Reviewed-by: Muchun Song Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9ec23138e410..bf80adca980b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -325,8 +325,8 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(page), \ - .pgoff = page_to_pgoff(page), \ + .nr_pages = compound_nr(_page), \ + .pgoff = page_to_pgoff(_page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ -- cgit v1.2.3 From c453d8c7d1384d7e1d7f26d3ec0d527092edf801 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 13 May 2022 12:17:05 -0700 Subject: mm/page_vma_mapped.c: check possible huge PMD map with transhuge_vma_suitable() IIUC page_vma_mapped_walk() checks if the vma is possibly huge PMD mapped with transparent_hugepage_active() and "pvmw->nr_pages >= HPAGE_PMD_NR". Actually pvmw->nr_pages is returned by compound_nr() or folio_nr_pages(), so the page should be THP as long as "pvmw->nr_pages >= HPAGE_PMD_NR". And it is guaranteed THP is allocated for valid VMA in the first place. But it may be not PMD mapped if the VMA is file VMA and it is not properly aligned. The transhuge_vma_suitable() is used to do such check, so replace transparent_hugepage_active() to it, which is too heavy and overkilling. 
Link: https://lkml.kernel.org/r/20220513191705.457775-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++-- mm/page_vma_mapped.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index de29821231c9..648cb3ce7099 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,8 +117,10 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { + unsigned long haddr; + /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -126,6 +128,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return false; } + haddr = addr & HPAGE_PMD_MASK; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; @@ -342,7 +346,7 @@ static inline bool transparent_hugepage_active(struct vm_area_struct *vma) } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { return false; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c10f839fc410..e971a467fcdf 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -243,7 +243,7 @@ restart: * cleared *pmd but not decremented compound_mapcount(). 
*/ if ((pvmw->flags & PVMW_SYNC) && - transparent_hugepage_active(vma) && + transhuge_vma_suitable(vma, pvmw->address) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); -- cgit v1.2.3 From 7ce82f4c3f3ead13a9d9498768e3b1a79975c4d8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:15 +0800 Subject: mm/migration: return errno when isolate_huge_page failed We might fail to isolate huge page due to e.g. the page is under migration which cleared HPageMigratable. We should return errno in this case rather than always return 1 which could confuse the user, i.e. the caller might think all of the memory is migrated while the hugetlb page is left behind. We make the prototype of isolate_huge_page consistent with isolate_lru_page as suggested by Huang Ying and rename isolate_huge_page to isolate_hugetlb as suggested by Muchun to improve the readability. Link: https://lkml.kernel.org/r/20220530113016.16663-4-linmiaohe@huawei.com Fixes: e8db67eb0ded ("mm: migrate: move_pages() supports thp migration") Signed-off-by: Miaohe Lin Suggested-by: Huang Ying Reported-by: kernel test robot (build error) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Howells Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/gup.c | 2 +- mm/hugetlb.c | 11 +++++------ mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 ++++--- 7 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4cff27d1198..756b66ff025e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_huge_page(struct 
page *page, struct list_head *list); +int isolate_hugetlb(struct page *page, struct list_head *list); int get_hwpoison_huge_page(struct page *page, bool *hugetlb); int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); @@ -376,9 +376,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_huge_page(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct page *page, struct list_head *list) { - return false; + return -EBUSY; } static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) diff --git a/mm/gup.c b/mm/gup.c index 407a81d5ca03..3129b754ade3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, * Try to move out any movable page before pinning the range. */ if (folio_test_hugetlb(folio)) { - if (!isolate_huge_page(&folio->page, + if (isolate_hugetlb(&folio->page, &movable_page_list)) isolation_error_count++; continue; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b36a4ef87a2e..dd9a46ccb79c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2766,8 +2766,7 @@ retry: * Fail with -EBUSY if not possible. 
*/ spin_unlock_irq(&hugetlb_lock); - if (!isolate_huge_page(old_page, list)) - ret = -EBUSY; + ret = isolate_hugetlb(old_page, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!HPageFreed(old_page)) { @@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && isolate_huge_page(head, list)) + if (page_count(head) && !isolate_hugetlb(head, list)) ret = 0; else if (!page_count(head)) ret = alloc_and_dissolve_huge_page(h, head, list); @@ -6960,15 +6959,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -bool isolate_huge_page(struct page *page, struct list_head *list) +int isolate_hugetlb(struct page *page, struct list_head *list) { - bool ret = true; + int ret = 0; spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { - ret = false; + ret = -EBUSY; goto unlock; } ClearHPageMigratable(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index da39ec8afca8..845369f839e1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2178,7 +2178,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool lru = PageLRU(page); if (PageHuge(page)) { - isolated = isolate_huge_page(page, pagelist); + isolated = !isolate_hugetlb(page, pagelist); } else { if (lru) isolated = !isolate_lru_page(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f1a730c4499..84990a14d51a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_huge_page(head, &source); + isolate_hugetlb(head, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d39b01fd52fe..9689919a2829 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { - if (!isolate_huge_page(page, qp->pagelist) && + if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index c83b3ae2e285..1d036dec1328 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -133,7 +133,7 @@ static void putback_movable_page(struct page *page) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_huge_page(). + * and isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -1628,8 +1628,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - isolate_huge_page(page, pagelist); - err = 1; + err = isolate_hugetlb(page, pagelist); + if (!err) + err = 1; } } else { struct page *head; -- cgit v1.2.3 From ad1ac596e8a8c4b06715dfbd89853eb73c9886b2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:16 +0800 Subject: mm/migration: fix potential pte_unmap on an not mapped pte __migration_entry_wait and migration_entry_wait_on_locked assume pte is always mapped from caller. But this is not the case when it's called from migration_entry_wait_huge and follow_huge_pmd. Add a hugetlbfs variant that calls hugetlb_migration_entry_wait(ptep == NULL) to fix this issue. 
Link: https://lkml.kernel.org/r/20220530113016.16663-5-linmiaohe@huawei.com Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Howells Cc: Huang Ying Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swapops.h | 12 ++++++++---- mm/hugetlb.c | 4 ++-- mm/migrate.c | 23 +++++++++++++++++++---- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f24775b41880..bb7afd03a324 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte); +#ifdef CONFIG_HUGETLB_PAGE +extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +#endif #else static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) { } +#ifdef CONFIG_HUGETLB_PAGE +static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } +#endif static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git 
a/mm/hugetlb.c b/mm/hugetlb.c index dd9a46ccb79c..ed202d29ca46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5702,7 +5702,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, mm, ptep); + migration_entry_wait_huge(vma, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -6927,7 +6927,7 @@ retry: } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait_huge((pte_t *)pmd, ptl); goto retry; } /* diff --git a/mm/migrate.c b/mm/migrate.c index 1d036dec1328..7934eebf1689 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -315,13 +315,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) +#ifdef CONFIG_HUGETLB_PAGE +void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); - __migration_entry_wait(mm, pte, ptl); + pte_t pte; + + spin_lock(ptl); + pte = huge_ptep_get(ptep); + + if (unlikely(!is_hugetlb_entry_migration(pte))) + spin_unlock(ptl); + else + migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); } +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); + + __migration_entry_wait_huge(pte, ptl); +} +#endif + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { -- cgit v1.2.3 From c9e124e0382d83d458db204f929002ea98daa6a8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:06 +0000 Subject: mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h The function for knowing if given 
monitoring context's targets will have pid or not is defined and used in dbgfs only. However, the logic is also needed for sysfs. This commit moves the code to damon.h and makes both dbgfs and sysfs to use it. Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/dbgfs.c | 15 +++++---------- mm/damon/sysfs.c | 8 +++----- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2765c7d99beb..b9aae19fab3e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a0dab8b5e45f..5ae810927309 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,11 +275,6 @@ out: return ret; } -static inline bool target_has_pid(const struct damon_ctx *ctx) -{ - return ctx->ops.id == DAMON_OPS_VADDR; -} - static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; @@ -288,7 +283,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) int rc; damon_for_each_target(t, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) /* Show pid numbers to debugfs users */ id = pid_vnr(t->pid); else @@ -415,7 +410,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if 
(target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -425,11 +420,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, if (!t) { damon_for_each_target_safe(t, next, ctx) damon_destroy_target(t); - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) t->pid = pids[i]; damon_add_target(ctx, t); } @@ -722,7 +717,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!target_has_pid(ctx)) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 09f9e8ca3d1f..8810e6abdb06 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2136,8 +2136,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -2181,8 +2180,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { + if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; @@ -2210,7 +2208,7 @@ static struct damon_target *damon_sysfs_existing_target( struct pid *pid; struct damon_target *t; - if (ctx->ops.id == DAMON_OPS_PADDR) { + if (!damon_target_has_pid(ctx)) { /* Up to only one target for paddr could exist */ damon_for_each_target(t, ctx) return t; -- cgit v1.2.3 From d9da8f6cf55eeca642c021912af1890002464c64 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 9 Jun 2022 20:18:46 +0200 Subject: mm: introduce clear_highpage_kasan_tagged Add a clear_highpage_kasan_tagged() helper that does 
clear_highpage() on a page potentially tagged by KASAN. This helper is used by the following patch. Link: https://lkml.kernel.org/r/4471979b46b2c487787ddcd08b9dc5fedd1b6ffd.1654798516.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/highmem.h | 10 ++++++++++ mm/page_alloc.c | 8 ++------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fee9835e3793..22379a63e293 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page) kunmap_local(kaddr); } +static inline void clear_highpage_kasan_tagged(struct page *page) +{ + u8 tag; + + tag = page_kasan_tag(page); + page_kasan_tag_reset(page); + clear_highpage(page); + page_kasan_tag_set(page, tag); +} + #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9234863f2488..248469134962 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1302,12 +1302,8 @@ static void kernel_init_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } -- cgit v1.2.3 From c15187a4a2d660bf490f7873afd0de5288f65c8f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:22 -0700 Subject: mm: memcontrol: introduce mem_cgroup_ino() and mem_cgroup_get_from_ino() Patch series "mm: introduce shrinker debugfs interface", v5. 
The only existing debugging mechanism is a couple of tracepoints in do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end. They do not cover everything, though: shrinkers which report 0 objects never show up, and there is no support for memcg-aware shrinkers. Shrinkers are identified by their scan function, which is not always enough (e.g. it is hard to guess which super block's shrinker it is when all you have is "super_cache_scan"). To provide better visibility and debug options for memory shrinkers, this patchset introduces a /sys/kernel/debug/shrinker interface, to some extent similar to /sys/kernel/slab. For each shrinker registered in the system a directory is created. For now, the directory contains only a "scan" file, which allows getting the number of managed objects for each memory cgroup (for memcg-aware shrinkers) and each NUMA node (for NUMA-aware shrinkers on a NUMA machine). Other interfaces might be added in the future. To make debugging more pleasant, the patchset also names all shrinkers, so that debugfs entries can have meaningful names. This patch (of 5): Shrinker debugfs requires a way to represent memory cgroups without using full paths, both for displaying information and for getting input from a user. The cgroup inode number is a perfect fit, and is already used by bpf. This commit adds a couple of helper functions which will be used to handle memcg-aware shrinkers.
Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Cc: Dave Chinner Cc: Kent Overstreet Cc: Hillf Danton Cc: Christophe JAILLET Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ mm/memcontrol.c | 23 +++++++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 04f2f33607e9..4d31ce55b1c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return memcg ? cgroup_ino(memcg->css.cgroup) : 0; +} + +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); @@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + return NULL; +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 655c09393ad5..1497affe08c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5088,6 +5088,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return idr_find(&mem_cgroup_idr, id); } +#ifdef CONFIG_SHRINKER_DEBUG +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state 
*css; + struct mem_cgroup *memcg; + + cgrp = cgroup_get_from_id(ino); + if (!cgrp) + return ERR_PTR(-ENOENT); + + css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); + if (css) + memcg = container_of(css, struct mem_cgroup, css); + else + memcg = ERR_PTR(-ENOENT); + + cgroup_put(cgrp); + + return memcg; +} +#endif + static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; -- cgit v1.2.3 From 5035ebc644aec92d55d1bbfe042f35341e4bffb5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:23 -0700 Subject: mm: shrinkers: introduce debugfs interface for memory shrinkers This commit introduces the /sys/kernel/debug/shrinker debugfs interface which provides the ability to observe the state of individual kernel memory shrinkers. Because the feature adds some memory overhead (which shouldn't be large unless there is a huge number of registered shrinkers), it's guarded by a config option (enabled by default). This commit introduces the "count" interface for each shrinker registered in the system. The output is in the following format: <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... ... To reduce the size of the output on machines with many thousands of cgroups, if the total number of objects on all nodes is 0, the line is omitted. If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as the cgroup inode id. If the shrinker is not NUMA-aware, 0's are printed for all nodes except the first one. This commit gives debugfs entries simple numeric names, which are not very convenient. The following commit in the series will provide shrinkers with more meaningful names. 
[akpm@linux-foundation.org: remove WARN_ON_ONCE(), per Roman] Reported-by: syzbot+300d27c79fe6d4cbcc39@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20220601032227.4076670-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Kent Overstreet Acked-by: Muchun Song Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 19 +++++- lib/Kconfig.debug | 9 +++ mm/Makefile | 1 + mm/shrinker_debug.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 +- 5 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 mm/shrinker_debug.c (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 76fbf92b04d9..2ced8149c513 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -72,6 +72,10 @@ struct shrinker { #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; +#endif +#ifdef CONFIG_SHRINKER_DEBUG + int debugfs_id; + struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); -#endif + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern void shrinker_debugfs_remove(struct shrinker *shrinker); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ +#endif /* _LINUX_SHRINKER_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e24db4bff19..0b483a8da409 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -699,6 +699,15 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT help Debug objects boot parameter default 
value +config SHRINKER_DEBUG + default y + bool "Enable shrinker debugging support" + depends on DEBUG_FS + help + Say Y to enable the shrinker debugfs interface which provides + visibility into the kernel memory shrinkers subsystem. + Disable it to avoid an extra memory footprint. + config HAVE_DEBUG_KMEMLEAK bool diff --git a/mm/Makefile b/mm/Makefile index 6f9ffa968a1a..9a564f836403 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c new file mode 100644 index 000000000000..1a70556bd46c --- /dev/null +++ b/mm/shrinker_debug.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +/* defined in vmscan.c */ +extern struct rw_semaphore shrinker_rwsem; +extern struct list_head shrinker_list; + +static DEFINE_IDA(shrinker_debugfs_ida); +static struct dentry *shrinker_debugfs_root; + +static unsigned long shrinker_count_objects(struct shrinker *shrinker, + struct mem_cgroup *memcg, + unsigned long *count_per_node) +{ + unsigned long nr, total = 0; + int nid; + + for_each_node(nid) { + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + + nr = shrinker->count_objects(shrinker, &sc); + if (nr == SHRINK_EMPTY) + nr = 0; + } else { + nr = 0; + } + + count_per_node[nid] = nr; + total += nr; + } + + return total; +} + +static int shrinker_debugfs_count_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + unsigned long *count_per_node; + struct mem_cgroup *memcg; + unsigned long total; + bool memcg_aware; + int ret, nid; + + count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), 
GFP_KERNEL); + if (!count_per_node) + return -ENOMEM; + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); + + memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + if (memcg && !mem_cgroup_online(memcg)) + continue; + + total = shrinker_count_objects(shrinker, + memcg_aware ? memcg : NULL, + count_per_node); + if (total) { + seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + for_each_node(nid) + seq_printf(m, " %lu", count_per_node[nid]); + seq_putc(m, '\n'); + } + + if (!memcg_aware) { + mem_cgroup_iter_break(NULL, memcg); + break; + } + + if (signal_pending(current)) { + mem_cgroup_iter_break(NULL, memcg); + ret = -EINTR; + break; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + + rcu_read_unlock(); + up_read(&shrinker_rwsem); + + kfree(count_per_node); + return ret; +} +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); + +int shrinker_debugfs_add(struct shrinker *shrinker) +{ + struct dentry *entry; + char buf[16]; + int id; + + lockdep_assert_held(&shrinker_rwsem); + + /* debugfs isn't initialized yet, add debugfs entries later. 
*/ + if (!shrinker_debugfs_root) + return 0; + + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL); + if (id < 0) + return id; + shrinker->debugfs_id = id; + + snprintf(buf, sizeof(buf), "%d", id); + + /* create debugfs entry */ + entry = debugfs_create_dir(buf, shrinker_debugfs_root); + if (IS_ERR(entry)) { + ida_free(&shrinker_debugfs_ida, id); + return PTR_ERR(entry); + } + shrinker->debugfs_entry = entry; + + debugfs_create_file("count", 0220, entry, shrinker, + &shrinker_debugfs_count_fops); + return 0; +} + +void shrinker_debugfs_remove(struct shrinker *shrinker) +{ + lockdep_assert_held(&shrinker_rwsem); + + if (!shrinker->debugfs_entry) + return; + + debugfs_remove_recursive(shrinker->debugfs_entry); + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); +} + +static int __init shrinker_debugfs_init(void) +{ + struct shrinker *shrinker; + struct dentry *dentry; + int ret = 0; + + dentry = debugfs_create_dir("shrinker", NULL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + shrinker_debugfs_root = dentry; + + /* Create debugfs entries for shrinkers registered at boot */ + down_write(&shrinker_rwsem); + list_for_each_entry(shrinker, &shrinker_list, list) + if (!shrinker->debugfs_entry) { + ret = shrinker_debugfs_add(shrinker); + if (ret) + break; + } + up_write(&shrinker_rwsem); + + return ret; +} +late_initcall(shrinker_debugfs_init); diff --git a/mm/vmscan.c b/mm/vmscan.c index f7d9a683e3a7..35dedff79eb4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task, task->reclaim_state = rs; } -static LIST_HEAD(shrinker_list); -static DECLARE_RWSEM(shrinker_rwsem); +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -650,6 +650,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; + 
shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); } @@ -677,6 +678,7 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); + shrinker_debugfs_remove(shrinker); up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); -- cgit v1.2.3 From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:24 -0700 Subject: mm: shrinkers: provide shrinkers with names Currently shrinkers are anonymous objects. For debugging purposes they can be identified by count/scan function names, but that's not always useful: e.g. for superblock shrinkers it's nice to have at least an idea of which superblock the shrinker belongs to. This commit adds names to shrinkers. The register_shrinker() and prealloc_shrinker() functions are extended to take a format and arguments to build a name. In some cases it's not possible to determine a good name at the time when a shrinker is allocated. For such cases shrinker_debugfs_rename() is provided. The expected format is: <subsystem>-<shrinker_type>[:<instance>]-<id> For some shrinkers an instance can be encoded as a (MAJOR:MINOR) pair. 
After this change the shrinker debugfs directory looks like: $ cd /sys/kernel/debug/shrinker/ $ ls dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42 mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43 mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44 rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49 sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13 sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36 sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19 sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10 sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9 sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37 sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38 sb-dax-11 sb-proc-45 sb-tmpfs-35 sb-debugfs-7 sb-proc-46 sb-tmpfs-40 [roman.gushchin@linux.dev: fix build warnings] Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle Reported-by: kernel test robot Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/android/binder_alloc.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 3 +- drivers/gpu/drm/msm/msm_gem_shrinker.c | 2 +- drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- drivers/md/bcache/btree.c | 2 +- drivers/md/dm-bufio.c | 3 +- drivers/md/dm-zoned-metadata.c | 4 +- drivers/md/raid5.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/xen/xenbus/xenbus_probe_backend.c | 2 +- fs/btrfs/super.c | 2 + fs/erofs/utils.c | 2 +- fs/ext4/extents_status.c | 3 +- fs/f2fs/super.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/main.c | 2 +- fs/jbd2/journal.c | 3 +- fs/mbcache.c | 2 +- fs/nfs/nfs42xattr.c | 7 +-- fs/nfs/super.c | 2 +- fs/nfsd/filecache.c | 2 +- fs/nfsd/nfscache.c | 3 +- fs/quota/dquot.c | 2 +- fs/super.c | 6 ++- fs/ubifs/super.c | 
2 +- fs/xfs/xfs_buf.c | 3 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_qm.c | 3 +- include/linux/shrinker.h | 14 +++++- kernel/rcu/tree.c | 2 +- mm/huge_memory.c | 4 +- mm/shrinker_debug.c | 47 ++++++++++++++++++- mm/vmscan.c | 58 ++++++++++++++++++++++-- mm/workingset.c | 2 +- mm/zsmalloc.c | 3 +- net/sunrpc/auth.c | 2 +- 39 files changed, 167 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..797d3286ecc1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6317,7 +6317,7 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker, "x86-mmu"); if (ret) goto out; diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 5649a0371a1f..51b502217d00 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1084,7 +1084,7 @@ int binder_alloc_shrinker_init(void) int ret = list_lru_init(&binder_alloc_lru); if (ret == 0) { - ret = register_shrinker(&binder_shrinker); + ret = register_shrinker(&binder_shrinker, "android-binder"); if (ret) list_lru_destroy(&binder_alloc_lru); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index 6a6ff98a8746..e43577e03067 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915) i915->mm.shrinker.count_objects = i915_gem_shrinker_count; i915->mm.shrinker.seeks = DEFAULT_SEEKS; i915->mm.shrinker.batch = 4096; - drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker)); + drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker, + "drm-i915_gem")); i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom; drm_WARN_ON(&i915->drm, 
register_oom_notifier(&i915->mm.oom_notifier)); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 086dacf2f26a..26e84d2ea6ae 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev) priv->shrinker.count_objects = msm_gem_shrinker_count; priv->shrinker.scan_objects = msm_gem_shrinker_scan; priv->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&priv->shrinker)); + WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem")); priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap; WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier)); diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index 77e7cb6d1ae3..bf0170782f25 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev) pfdev->shrinker.count_objects = panfrost_gem_shrinker_count; pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan; pfdev->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&pfdev->shrinker)); + WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost")); } /** diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 1bba0a0ed3f9..21b61631f73a 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) mm_shrinker.count_objects = ttm_pool_shrinker_count; mm_shrinker.scan_objects = ttm_pool_shrinker_scan; mm_shrinker.seeks = 1; - return register_shrinker(&mm_shrinker); + return register_shrinker(&mm_shrinker, "drm-ttm_pool"); } /** diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e136d6edc1ed..147c493a989a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ 
-812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - if (register_shrinker(&c->shrink)) + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) pr_warn("bcache: %s: could not register shrinker\n", __func__); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5ffa1dcf84cf..3ff571b20f14 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker); + r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name, + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); if (r) goto bad; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index d1ea66114d14..46648f6100fb 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker); + ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); if (ret) { dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d09256d7f81..780ae66840b7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker); + ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 
086ce77d9074..c2d2fa114e65 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1587,7 +1587,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b) b->shrinker.count_objects = vmballoon_shrinker_count; b->shrinker.seeks = DEFAULT_SEEKS; - r = register_shrinker(&b->shrinker); + r = register_shrinker(&b->shrinker, "vmw-balloon"); if (r == 0) b->shrinker_registered = true; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b9737da6c4dd..cba57b1f382f 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -875,7 +875,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) vb->shrinker.count_objects = virtio_balloon_shrinker_count; vb->shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&vb->shrinker); + return register_shrinker(&vb->shrinker, "virtio-balloon"); } static int virtballoon_probe(struct virtio_device *vdev) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 5abded97e1a7..9c09f89d8278 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); - if (register_shrinker(&backend_memory_shrinker)) + if (register_shrinker(&backend_memory_shrinker, "xen-backend")) pr_warn("shrinker registration failed\n"); return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6627dd7875ee..eee3e96d877f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1815,6 +1815,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + s->s_id); btrfs_sb(s)->bdev_holder = fs_type; if (!strstr(crc32c_impl(), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); diff --git 
a/fs/erofs/utils.c b/fs/erofs/utils.c index ec9a1d780dc1..46627cb69abe 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = { int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info); + return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); } void erofs_exit_shrinker(void) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9a3a8996aacf..23167efda95e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", + sbi->s_sb->s_id); if (err) goto err4; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 37221e94e5ef..bce02306f7a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4579,7 +4579,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info); + err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c992d53013d3..dca842379cab 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2533,7 +2533,7 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker, "gfs2-glock"); if (ret) { destroy_workqueue(gfs2_delete_workqueue); destroy_workqueue(glock_workqueue); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 244187e3e70f..b66a3e1ec152 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = 
register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); if (error) goto fail_shrinker; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1..45e4655c8033 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev, if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker)) { + if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); goto err_cleanup; } diff --git a/fs/mbcache.c b/fs/mbcache.c index 97c54d3a2227..0b833da0a9a5 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink)) { + if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { kfree(cache->c_hash); kfree(cache); goto err_out; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index e7b34f7e0614..a9bf09fdf2c3 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void) if (ret) goto out2; - ret = register_shrinker(&nfs4_xattr_cache_shrinker); + ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache"); if (ret) goto out1; - ret = register_shrinker(&nfs4_xattr_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry"); if (ret) goto out; - ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker, + "nfs-xattr_large_entry"); if (!ret) return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6ab5eeb000dc..82944e14fcea 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ 
-149,7 +149,7 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker, "nfs-acl"); if (ret < 0) goto error_3; #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb2d590c036..a605c0e39b09 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -670,7 +670,7 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker); + ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); if (ret) { pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); goto out_lru; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 7da88bdc0d6c..9b31e1103e7b 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker); + status = register_shrinker(&nn->nfsd_reply_cache_shrinker, + "nfsd-reply:%s", nn->nfsd_name); if (status) goto out_stats_destroy; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..e0b659900e70 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2995,7 +2995,7 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker)) + if (register_shrinker(&dqcache_shrinker, "dquota-cache")) panic("Cannot register dquot shrinker"); return 0; diff --git a/fs/super.c b/fs/super.c index 60f57c7bc0a6..4fca6657f442 100644 --- a/fs/super.c +++ b/fs/super.c @@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; 
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink)) + if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; @@ -1288,6 +1288,8 @@ int get_tree_bdev(struct fs_context *fc, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fc->fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, fc); if (error) { @@ -1363,6 +1365,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0978d01b0ea4..d0c9a09988bc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2430,7 +2430,7 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); if (err) goto out_slab; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bf4e60871068..4aa9c9cf5b6e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1986,7 +1986,8 @@ xfs_alloc_buftarg( btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker)) + if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", + mp->m_super->s_id)) goto error_pcpu; return btp; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..a1941c8b8630 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2201,5 +2201,5 @@ xfs_inodegc_register_shrinker( shrink->flags = SHRINKER_NONSLAB; shrink->batch = 
XFS_INODEGC_SHRINKER_BATCH; - return register_shrinker(shrink); + return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index abf08bbf34a9..c31d57453ceb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -677,7 +677,8 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - error = register_shrinker(&qinf->qi_shrinker); + error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", + mp->m_super->s_id); if (error) goto out_free_inos; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 2ced8149c513..08e6054e061f 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -75,6 +75,7 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; + const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ @@ -92,9 +93,11 @@ struct shrinker { */ #define SHRINKER_NONSLAB (1 << 3) -extern int prealloc_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int register_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); @@ -102,6 +105,8 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, + const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { @@ -110,5 +115,10 @@ static inline int 
shrinker_debugfs_add(struct shrinker *shrinker) static inline void shrinker_debugfs_remove(struct shrinker *shrinker) { } +static inline __printf(2, 3) +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + return 0; +} #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..4b3bf6ebb1eb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4884,7 +4884,7 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker)) + if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) pr_err("Failed to register kfree_rcu() shrinker!\n"); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9b90a8d7dfa..60d742c33de3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -423,10 +423,10 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker); + err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); if (err) goto err_split_shrinker; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 1a70556bd46c..781ecbd3d608 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -102,7 +102,7 @@ DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); int shrinker_debugfs_add(struct shrinker *shrinker) { struct dentry *entry; - char buf[16]; + char buf[128]; int id; lockdep_assert_held(&shrinker_rwsem); @@ -116,7 +116,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) return id; shrinker->debugfs_id = id; - snprintf(buf, sizeof(buf), "%d", id); + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id); /* create debugfs entry */ entry = debugfs_create_dir(buf, shrinker_debugfs_root); @@ -131,10 +131,53 @@ int 
shrinker_debugfs_add(struct shrinker *shrinker) return 0; } +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + struct dentry *entry; + char buf[128]; + const char *new, *old; + va_list ap; + int ret = 0; + + va_start(ap, fmt); + new = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + + if (!new) + return -ENOMEM; + + down_write(&shrinker_rwsem); + + old = shrinker->name; + shrinker->name = new; + + if (shrinker->debugfs_entry) { + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, + shrinker->debugfs_id); + + entry = debugfs_rename(shrinker_debugfs_root, + shrinker->debugfs_entry, + shrinker_debugfs_root, buf); + if (IS_ERR(entry)) + ret = PTR_ERR(entry); + else + shrinker->debugfs_entry = entry; + } + + up_write(&shrinker_rwsem); + + kfree_const(old); + + return ret; +} +EXPORT_SYMBOL(shrinker_debugfs_rename); + void shrinker_debugfs_remove(struct shrinker *shrinker) { lockdep_assert_held(&shrinker_rwsem); + kfree_const(shrinker->name); + if (!shrinker->debugfs_entry) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 35dedff79eb4..97ac6c6c026d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -608,7 +608,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, /* * Add a shrinker callback to be called from the vm. */ -int prealloc_shrinker(struct shrinker *shrinker) +static int __prealloc_shrinker(struct shrinker *shrinker) { unsigned int size; int err; @@ -632,8 +632,36 @@ int prealloc_shrinker(struct shrinker *shrinker) return 0; } +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + return __prealloc_shrinker(shrinker); +} +#endif + void free_prealloced_shrinker(struct shrinker *shrinker) { +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); +#endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); @@ -654,15 +682,39 @@ void register_shrinker_prepared(struct shrinker *shrinker) up_write(&shrinker_rwsem); } -int register_shrinker(struct shrinker *shrinker) +static int __register_shrinker(struct shrinker *shrinker) { - int err = prealloc_shrinker(shrinker); + int err = __prealloc_shrinker(shrinker); if (err) return err; register_shrinker_prepared(shrinker); return 0; } + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + return __register_shrinker(shrinker); +} +#endif EXPORT_SYMBOL(register_shrinker); /* diff --git a/mm/workingset.c b/mm/workingset.c index 592569a8974c..a5e84862fc86 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -625,7 +625,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = prealloc_shrinker(&workingset_shadow_shrinker); + ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); if (ret) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d5fc04385b8..f24b71568e83 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2217,7 +2217,8 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->shrinker.batch = 0; pool->shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&pool->shrinker); + return register_shrinker(&pool->shrinker, "mm-zspool:%s", + pool->name); } /** diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 682fcd24bf43..04e7b55fe0d9 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -874,7 +874,7 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker); + err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); if (err < 0) goto out2; return 0; -- cgit v1.2.3 From 8cdcc532268df0893d9756f537cbce479f4c4831 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:56 +0000 Subject: mm/damon/schemes: add 'LRU_PRIO' DAMOS action This commit adds a new DAMOS action called 'LRU_PRIO' for the physical address space. The action prioritizes pages in the memory regions of the user-specified target access pattern on their LRU lists. This is hence supposed to be used for frequently accessed (hot) memory regions so that hot pages could be more likely protected under memory pressure. Internally, it simply calls 'mark_page_accessed()'. 
Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/ops-common.c | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/ops-common.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 5 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b9aae19fab3e..4c64e03e94d8 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -86,6 +86,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -95,6 +96,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 10ef20b2003f..b1335de200e7 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 
1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index e790cb5f8fe0..52329ff361cd 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 7bcd48066b43..f145b1d51e13 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -233,6 +233,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + mark_page_accessed(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -240,6 +256,8 @@ static unsigned long damon_pa_apply_scheme(struct 
damon_ctx *ctx, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pa_pageout(r); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r); default: break; } @@ -253,6 +271,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pageout_score(context, r, scheme); + case DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index c35809c6087c..86c69f980927 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -762,6 +762,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "pageout", "hugepage", "nohugepage", + "lru_prio", "stat", }; -- cgit v1.2.3 From 99cdc2cd180a7adc87badc9ca92f8af803d8bf3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:58 +0000 Subject: mm/damon/schemes: add 'LRU_DEPRIO' action This commit adds a new DAMON-based operation scheme action called 'LRU_DEPRIO' for physical address space. The action deprioritizes pages in the memory area of the target access pattern on their LRU lists. This is hence supposed to be used for rarely accessed (cold) memory regions so that cold pages could be more likely reclaimed first under memory pressure. Internally, it simply calls 'deactivate_page()'. Using this with 'LRU_PRIO' action for hot pages, users can proactively sort LRU lists based on the access pattern. That is, it can make the LRU lists a somewhat more trustworthy source of access temperature. As a result, efficiency of LRU-lists based mechanisms including the reclamation target selection could be improved.
Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4c64e03e94d8..7b1f4a488230 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -97,6 +98,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index f145b1d51e13..dc131c6a5403 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -249,6 +249,22 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -258,6 +274,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_pageout(r); case DAMOS_LRU_PRIO: return damon_pa_mark_accessed(r); + case DAMOS_LRU_DEPRIO: + 
return damon_pa_deactivate_pages(r); default: break; } @@ -273,6 +291,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return damon_pageout_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_pageout_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 86c69f980927..7488e27c87c3 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -763,6 +763,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "hugepage", "nohugepage", "lru_prio", + "lru_deprio", "stat", }; -- cgit v1.2.3 From 64fe24a3e05e5f3ac56fcd45afd2fd1d9cc8fcb6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 14 Jun 2022 11:36:29 +0200 Subject: mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we can try mapping anonymous pages in a private writable mapping writable if they are exclusive, the PTE is already dirty, and no special handling applies. Mapping the anonymous page writable is essentially the same thing the write fault handler would do in this case. Special handling is required for uffd-wp and softdirty tracking, so take care of that properly. Also, leave PROT_NONE handling alone for now; in the future, we could similarly extend the logic in do_numa_page() or use pte_mk_savedwrite() here. While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE) performance, it should also be a valuable optimization for uffd-wp, when un-protecting. This has been previously suggested by Peter Collingbourne in [1], relevant in the context of the Scudo memory allocator, before we had PageAnonExclusive. 
This commit doesn't add the same handling for PMDs (i.e., anonymous THP, anonymous hugetlb); benchmark results from Andrea indicate that there are minor performance gains, so it might still be valuable to streamline that logic for all anonymous pages in the future. As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually happening. Micro-benchmark courtesy of Andrea: === #define _GNU_SOURCE #include <sys/mman.h> #include <stdlib.h> #include <string.h> #include <stdio.h> #include <unistd.h> #define SIZE (1024*1024*1024) int main(int argc, char *argv[]) { char *p; if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE)) perror("posix_memalign"), exit(1); if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE)) perror("madvise"); explicit_bzero(p, SIZE); for (int loops = 0; loops < 40; loops++) { if (mprotect(p, SIZE, PROT_READ)) perror("mprotect"), exit(1); if (mprotect(p, SIZE, PROT_READ|PROT_WRITE)) perror("mprotect"), exit(1); explicit_bzero(p, SIZE); } } === Results on my Ryzen 9 3900X: Stock 10 runs (lower is better): AVG 6.398s, STDEV 0.043 Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026 === [1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Peter Collingbourne Acked-by: Peter Xu Cc: Nadav Amit Cc: Dave Hansen Cc: Andrea Arcangeli Cc: Yang Shi Cc: Hugh Dickins Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++-- mm/mprotect.c | 77 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 68 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf3d0d673f6b..09ea26056e2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1962,8 +1962,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, * for now all the callers are only use one of the flags at
the same * time. */ -/* Whether we should allow dirty bit accounting */ -#define MM_CP_DIRTY_ACCT (1UL << 0) +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. + */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..996a97e213ad 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -38,6 +38,39 @@ #include "internal.h" +static inline bool can_change_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + + if (pte_protnone(pte) || !pte_dirty(pte)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_pte_wp(vma, pte)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* + * We can only special-case on exclusive anonymous pages, + * because we know that our write-fault handler similarly would + * map them writable without any additional checks while holding + * the PT lock. 
+ */ + page = vm_normal_page(vma, addr, pte); + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + } + + return true; +} + static unsigned long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; - bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, ptent = pte_wrprotect(ptent); ptent = pte_mkuffd_wp(ptent); } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled - * by PF interrupt handler, then - * things like COW could be properly - * handled. - */ ptent = pte_clear_uffd_wp(ptent); } - /* Avoid taking write faults for known dirty pages */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) { + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. 
+ */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) ptent = pte_mkwrite(ptent); - } + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); if (pte_needs_flush(oldpte, ptent)) tlb_flush_pte_range(tlb, addr, PAGE_SIZE); @@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; + bool try_change_writable; pgoff_t pgoff; int error; - int dirty_accountable = 0; if (newflags == oldflags) { *pprev = vma; @@ -583,11 +621,20 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); + else + try_change_writable = !!(vma->vm_flags & VM_WRITE); vma_set_page_prot(vma); change_protection(tlb, vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major -- cgit v1.2.3 From b8cecb9376b9d3031cf62b476a0db087b6b01072 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:44 +0100 Subject: mm/vmscan: convert reclaim_clean_pages_from_list() to folios Patch series "Convert much of vmscan to folios". vmscan always operates on folios since it puts the pages on the LRU list. Switching all of these functions from pages to folios saves 1483 bytes of text from removing all the baggage around calling compound_head() and similar functions.
This patch (of 5): This is a straightforward conversion which removes several hidden calls to compound_head, saving 330 bytes of kernel text. Link: https://lkml.kernel.org/r/20220617154248.700416-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617154248.700416-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ mm/vmscan.c | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..f32aade2a6e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -670,6 +670,12 @@ static __always_inline bool PageAnon(struct page *page) return folio_test_anon(page_folio(page)); } +static __always_inline bool __folio_test_movable(const struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == + PAGE_MAPPING_MOVABLE; +} + static __always_inline int __PageMovable(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == diff --git a/mm/vmscan.c b/mm/vmscan.c index 97ac6c6c026d..2ecca45672e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2041,7 +2041,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *page_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2049,16 +2049,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, }; struct reclaim_stat stat; unsigned int nr_reclaimed; - struct page *page, *next; - LIST_HEAD(clean_pages); + struct folio *folio, *next; + LIST_HEAD(clean_folios); unsigned int noreclaim_flag; - list_for_each_entry_safe(page, next, page_list, lru) { - if (!PageHuge(page) && page_is_file_lru(page) && - !PageDirty(page) && !__PageMovable(page) && - !PageUnevictable(page)) { - ClearPageActive(page); - list_move(&page->lru, &clean_pages); + 
list_for_each_entry_safe(folio, next, folio_list, lru) { + if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && + !folio_test_dirty(folio) && !__folio_test_movable(folio) && + !folio_test_unevictable(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &clean_folios); } } @@ -2069,11 +2069,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); - list_splice(&clean_pages, page_list); + list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* -- cgit v1.2.3 From e3c4cebf3f9db8c9150eb1982da7e353d9938bed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:49:59 +0100 Subject: mm: add folios_put() Patch series "Convert the swap code to be more folio-based". There's still more to do with the swap code, but this reaps a lot of the folio benefit. More than 4kB of kernel text saved (with the UEK7 kernel config). I don't know how much that's going to translate into CPU savings, but some of those compound_head() calls are on every page free, so it should be noticeable. It might even be noticeable just from an I-cache consumption perspective. This patch (of 22): This is just a wrapper around release_pages() for now. Place the prototype in mm.h along with folio_put() and folio_put_refs().
Link: https://lkml.kernel.org/r/20220617175020.717127-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617175020.717127-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 +++++++++++++++++++ include/linux/pagemap.h | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09ea26056e2f..09670ccb94e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1220,6 +1220,25 @@ static inline void folio_put_refs(struct folio *folio, int refs) __put_page(&folio->page); } +void release_pages(struct page **pages, int nr); + +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * @nr: How many folios there are. + * + * Like folio_put(), but for an array of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which + * need to be taken if the folios are freed. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. 
+ */ +static inline void folios_put(struct folio **folios, unsigned int nr) +{ + release_pages((struct page **)folios, nr); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ce96866fbec4..c399a9c5da7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -void release_pages(struct page **pages, int nr); - struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -- cgit v1.2.3 From 7d80dd096f8f889128f67a2d452e4dadeed71e63 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:01 +0100 Subject: mm/swap: make __pagevec_lru_add static __pagevec_lru_add has no callers outside swap.c, so make it static, and move it to a more logical position in the file. 
Link: https://lkml.kernel.org/r/20220617175020.717127-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 1 - mm/swap.c | 126 ++++++++++++++++++++++++------------------------ 2 files changed, 63 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 67b1246f136b..b0e3540f3a4c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,7 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, pgoff_t *start, pgoff_t end); diff --git a/mm/swap.c b/mm/swap.c index 4265bee41bbd..cab77a5c64c7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -228,6 +228,69 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); +static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) +{ + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + folio_set_lru(folio); + /* + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears PageMlocked while the LRU + * lock is held. + * + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) 
+ */ + if (folio_evictable(folio)) { + if (was_unevictable) + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } else { + folio_clear_active(folio); + folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; + if (!was_unevictable) + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + } + + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); +} + +/* + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +static void __pagevec_lru_add(struct pagevec *pvec) +{ + int i; + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct folio *folio = page_folio(pvec->pages[i]); + + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + __pagevec_lru_add_fn(folio, lruvec); + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; @@ -1036,69 +1099,6 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) -{ - int was_unevictable = folio_test_clear_unevictable(folio); - long nr_pages = folio_nr_pages(folio); - - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - - folio_set_lru(folio); - /* - * Is an smp_mb__after_atomic() still required here, before - * folio_evictable() tests PageMlocked, to rule out the possibility - * of stranding an evictable folio on an unevictable LRU? 
I think - * not, because __munlock_page() only clears PageMlocked while the LRU - * lock is held. - * - * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear PageMlocked after - * put_page_testzero() has excluded any other users of the page.) - */ - if (folio_evictable(folio)) { - if (was_unevictable) - __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); - } else { - folio_clear_active(folio); - folio_set_unevictable(folio); - /* - * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another - * actor has already counted the mlock or not. Err on the - * safe side, underestimate, let page reclaim fix it, rather - * than leaving a page on the unevictable LRU indefinitely. - */ - folio->mlock_count = 0; - if (!was_unevictable) - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); - } - - lruvec_add_folio(lruvec, folio); - trace_mm_lru_insertion(folio); -} - -/* - * Add the passed pages to the LRU, then drop the caller's refcount - * on them. Reinitialises the caller's pagevec. - */ -void __pagevec_lru_add(struct pagevec *pvec) -{ - int i; - struct lruvec *lruvec = NULL; - unsigned long flags = 0; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct folio *folio = page_folio(pvec->pages[i]); - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - __pagevec_lru_add_fn(folio, lruvec); - } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); -} - /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune -- cgit v1.2.3 From 8d29c7036f5ff360ea1f51b9fed5d909be7c8094 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:13 +0100 Subject: mm/swap: convert __put_page() to __folio_put() Saves 11 bytes of text by removing a check of PageTail. 
Link: https://lkml.kernel.org/r/20220617175020.717127-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/swap.c | 14 +++++++------- net/core/page_pool.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09670ccb94e7..3fb49aec13fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -855,7 +855,7 @@ static inline struct folio *virt_to_folio(const void *x) return page_folio(page); } -void __put_page(struct page *page); +void __folio_put(struct folio *folio); void put_pages_list(struct list_head *pages); @@ -1197,7 +1197,7 @@ static inline __must_check bool try_get_page(struct page *page) static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) - __put_page(&folio->page); + __folio_put(folio); } /** @@ -1217,7 +1217,7 @@ static inline void folio_put(struct folio *folio) static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) - __put_page(&folio->page); + __folio_put(folio); } void release_pages(struct page **pages, int nr); diff --git a/mm/swap.c b/mm/swap.c index a5a91aec83da..d09e9ac53809 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -119,16 +119,16 @@ static void __put_compound_page(struct page *page) destroy_compound_page(page); } -void __put_page(struct page *page) +void __folio_put(struct folio *folio) { - if (unlikely(is_zone_device_page(page))) - free_zone_device_page(page); - else if (unlikely(PageCompound(page))) - __put_compound_page(page); + if (unlikely(folio_is_zone_device(folio))) + free_zone_device_page(&folio->page); + else if (unlikely(folio_test_large(folio))) + __put_compound_page(&folio->page); else - __put_single_page(page); + __put_single_page(&folio->page); } -EXPORT_SYMBOL(__put_page); +EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages diff --git a/net/core/page_pool.c 
b/net/core/page_pool.c index f18e6e771993..db70e94c8df2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -16,7 +16,7 @@ #include #include #include -#include /* for __put_page() */ +#include /* for put_page() */ #include #include -- cgit v1.2.3 From 5375336c8c42a343c3b440b6f1e21c65e7b174b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:17 +0100 Subject: mm: convert destroy_compound_page() to destroy_large_folio() All callers now have a folio, so push the folio->page conversion down to this function. [akpm@linux-foundation.org: uninline destroy_large_folio() to fix build issue] Link: https://lkml.kernel.org/r/20220617175020.717127-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +----- mm/page_alloc.c | 8 ++++++++ mm/swap.c | 2 +- mm/vmscan.c | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fb49aec13fd..9cc02a7e503b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,11 +892,7 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline void destroy_compound_page(struct page *page) -{ - VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - compound_page_dtors[page[1].compound_dtor](page); -} +void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 248469134962..52fd92b2c1fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -744,6 +744,14 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } +void destroy_large_folio(struct folio *folio) +{ + enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); + compound_page_dtors[dtor](&folio->page); +} + #ifdef 
CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; diff --git a/mm/swap.c b/mm/swap.c index 5f6caa651599..1f563d857768 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -115,7 +115,7 @@ static void __folio_put_large(struct folio *folio) */ if (!folio_test_hugetlb(folio)) __page_cache_release(folio); - destroy_compound_page(&folio->page); + destroy_large_folio(folio); } void __folio_put(struct folio *folio) diff --git a/mm/vmscan.c b/mm/vmscan.c index e7d3db64a4e0..e660d7205f47 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1979,7 +1979,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) - destroy_compound_page(&folio->page); + destroy_large_folio(folio); else list_add(&folio->lru, &free_pages); continue; @@ -2348,7 +2348,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, if (unlikely(folio_test_large(folio))) { spin_unlock_irq(&lruvec->lru_lock); - destroy_compound_page(&folio->page); + destroy_large_folio(folio); spin_lock_irq(&lruvec->lru_lock); } else list_add(&folio->lru, &folios_to_free); -- cgit v1.2.3 From ed7802dd48f7a507213cbb95bb4c6f1fe134eb5d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:49 +0800 Subject: mm: memory_hotplug: enumerate all supported section flags Patch series "make hugetlb_optimize_vmemmap compatible with memmap_on_memory", v3. This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory. This patch (of 2): We are almost running out of section flags, only one bit is available in the worst case (powerpc with 256k pages). However, there are still some free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has 10 bits available, arm64 has 8 bits available with worst case of 64K pages). We have hard coded those numbers in code, it is inconvenient to use those bits on other architectures except powerpc. So transfer those section flags to enumeration to make it easy to add new section flags in the future. 
Also, move SECTION_TAINT_ZONE_DEVICE into the scope of CONFIG_ZONE_DEVICE to save a bit on non-zone-device case. [songmuchun@bytedance.com: replace enum with defines per David] Link: https://lkml.kernel.org/r/20220620110616.12056-2-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Oscar Salvador Cc: Paul E. McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++--------- mm/memory_hotplug.c | 6 ++++++ mm/sparse.c | 2 +- 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aab70355d64f..2b5757752333 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void); * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available. + * To sum it up, at least 6 bits are available on all architectures. + * However, we can exceed 6 bits on some other architectures except + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available + * with the worst case of 64K pages on arm64) if we make sure the + * exceeded bit is not applicable to powerpc. 
*/ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) -#define SECTION_MAP_LAST_BIT (1UL<<5) -#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 6 +enum { + SECTION_MARKED_PRESENT_BIT, + SECTION_HAS_MEM_MAP_BIT, + SECTION_IS_ONLINE_BIT, + SECTION_IS_EARLY_BIT, +#ifdef CONFIG_ZONE_DEVICE + SECTION_TAINT_ZONE_DEVICE_BIT, +#endif + SECTION_MAP_LAST_BIT, +}; + +#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) +#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) +#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) +#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) +#ifdef CONFIG_ZONE_DEVICE +#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) +#endif +#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +#ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } +#else +static inline int online_device_section(struct mem_section *section) +{ + return 0; +} +#endif static inline int online_section_nr(unsigned long nr) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 84990a14d51a..a2a6d280054f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -670,12 +670,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon } +#ifdef CONFIG_ZONE_DEVICE static void section_taint_zone_device(unsigned long pfn) { struct mem_section *ms = __pfn_to_section(pfn); 
ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; } +#else +static inline void section_taint_zone_device(unsigned long pfn) +{ +} +#endif /* * Associate the pfn range with the given zone, initializing the memmaps diff --git a/mm/sparse.c b/mm/sparse.c index cb3bfae64036..e5a8a3a0edd7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p { unsigned long coded_mem_map = (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL< PFN_SECTION_SHIFT); BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); return coded_mem_map; } -- cgit v1.2.3 From 66361095129b3b5d065e6c09cf0c085ef4a8c40f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:50 +0800 Subject: mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with memmap_on_memory For now, the feature of hugetlb_free_vmemmap is not compatible with the feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap takes precedence over memory_hotplug.memmap_on_memory. However, someone wants to make memory_hotplug.memmap_on_memory takes precedence over hugetlb_free_vmemmap since memmap_on_memory makes it more likely to succeed memory hotplug in close-to-OOM situations. So the decision of making hugetlb_free_vmemmap take precedence is not wise and elegant. The proper approach is to have hugetlb_vmemmap.c do the check whether the section which the HugeTLB pages belong to can be optimized. If the section's vmemmap pages are allocated from the added memory block itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap, otherwise, do the optimization. Then both kernel parameters are compatible. So this patch introduces VmemmapSelfHosted to mask any non-optimizable vmemmap pages. The hugetlb_vmemmap can use this flag to detect if a vmemmap page can be optimized. 
[songmuchun@bytedance.com: walk vmemmap page tables to avoid false-positive] Link: https://lkml.kernel.org/r/20220620110616.12056-3-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Co-developed-by: Oscar Salvador Signed-off-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 22 ++++----- Documentation/admin-guide/sysctl/vm.rst | 5 +- include/linux/memory_hotplug.h | 9 ---- include/linux/page-flags.h | 11 +++++ mm/hugetlb_vmemmap.c | 66 ++++++++++++++++++++++--- mm/memory_hotplug.c | 27 +++++----- 6 files changed, 93 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8c0ea6b6c6a9..2cacd4f8deb7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1722,9 +1722,11 @@ Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. - This is not compatible with memory_hotplug.memmap_on_memory. - If both parameters are enabled, hugetlb_free_vmemmap takes - precedence over memory_hotplug.memmap_on_memory. + Note that the vmemmap pages may be allocated from the added + memory block itself when memory_hotplug.memmap_on_memory is + enabled, those vmemmap pages cannot be optimized even if this + feature is enabled. Other vmemmap pages not allocated from + the added memory block itself do not be affected. hung_task_panic= [KNL] Should the hung task detector generate panics. @@ -3068,10 +3070,12 @@ [KNL,X86,ARM] Boolean flag to enable this feature. 
Format: {on | off (default)} When enabled, runtime hotplugged memory will - allocate its internal metadata (struct pages) - from the hotadded memory which will allow to - hotadd a lot of memory without requiring - additional memory to do so. + allocate its internal metadata (struct pages, + those vmemmap pages cannot be optimized even + if hugetlb_free_vmemmap is enabled) from the + hotadded memory which will allow to hotadd a + lot of memory without requiring additional + memory to do so. This feature is disabled by default because it has some implication on large (e.g. GB) allocations in some configurations (e.g. small @@ -3081,10 +3085,6 @@ Note that even when enabled, there are a few cases where the feature is not effective. - This is not compatible with hugetlb_free_vmemmap. If - both parameters are enabled, hugetlb_free_vmemmap takes - precedence over memory_hotplug.memmap_on_memory. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest Format: default : 0 diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 4a440a7cfeb0..f74f722ad702 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst hugetlb_optimize_vmemmap ======================== -This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) -is configured or the size of 'struct page' (a structure defined in -include/linux/mm_types.h) is not power of two (an unusual system config could +This knob is not available when the size of 'struct page' (a structure defined +in include/linux/mm_types.h) is not power of two (an unusual system config could result in this). 
Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d7edf62a6a..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -bool mhp_memmap_on_memory(void); -#else -static inline bool mhp_memmap_on_memory(void) -{ - return false; -} -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f32aade2a6e0..82719d33c0f1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -193,6 +193,11 @@ enum pageflags { /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, + +#ifdef CONFIG_MEMORY_HOTPLUG + /* For self-hosted memmap pages */ + PG_vmemmap_self_hosted = PG_owner_priv_1, +#endif }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_MEMORY_HOTPLUG +PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) +#else +PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba29c15c53d6..1362feb3c6c9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -10,7 +10,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include +#include #include "hugetlb_vmemmap.h" /* @@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } +static unsigned int vmemmap_optimizable_pages(struct hstate *h, + struct page *head) +{ + if 
(READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return 0; + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + pmd_t *pmdp, pmd; + struct page *vmemmap_page; + unsigned long vaddr = (unsigned long)head; + + /* + * Only the vmemmap page's vmemmap page can be self-hosted. + * Walking the page tables to find the backing page of the + * vmemmap page. + */ + pmdp = pmd_off_k(vaddr); + /* + * The READ_ONCE() is used to stabilize *pmdp in a register or + * on the stack so that it will stop changing under the code. + * The only concurrent operation where it can be changed is + * split_vmemmap_huge_pmd() (*pmdp will be stable after this + * operation). + */ + pmd = READ_ONCE(*pmdp); + if (pmd_leaf(pmd)) + vmemmap_page = pmd_page(pmd) + pte_index(vaddr); + else + vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr)); + /* + * Due to HugeTLB alignment requirements and the vmemmap pages + * being at the start of the hotplugged memory region in + * memory_hotplug.memmap_on_memory case. Checking any vmemmap + * page's vmemmap page if it is marked as VmemmapSelfHosted is + * sufficient. 
+ * + * [ hotplugged memory ] + * [ section ][...][ section ] + * [ vmemmap ][ usable memory ] + * ^ | | | + * +---+ | | + * ^ | | + * +-------+ | + * ^ | + * +-------------------------------------------+ + */ + if (PageVmemmapSelfHosted(vmemmap_page)) + return 0; + } + + return hugetlb_optimize_vmemmap_pages(h); +} + void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_pages = vmemmap_optimizable_pages(h, head); if (!vmemmap_pages) return; - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return; - static_branch_inc(&hugetlb_optimize_vmemmap_key); vmemmap_addr += RESERVE_VMEMMAP_SIZE; @@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { static __init int hugetlb_vmemmap_sysctls_init(void) { /* - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" - * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "struct page" crosses page boundaries, the vmemmap pages cannot + * be optimized. 
*/ - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page))) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2a6d280054f..99ecb2b3ff53 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -43,30 +43,22 @@ #include "shuffle.h" #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp) -{ - if (hugetlb_optimize_vmemmap_enabled()) - return 0; - return param_set_bool(val, kp); -} - -static const struct kernel_param_ops memmap_on_memory_ops = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = memmap_on_memory_set, - .get = param_get_bool, -}; - /* * memory_hotplug.memmap_on_memory parameter */ static bool memmap_on_memory __ro_after_init; -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444); +module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); -bool mhp_memmap_on_memory(void) +static inline bool mhp_memmap_on_memory(void) { return memmap_on_memory; } +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} #endif enum { @@ -1035,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; - int ret; + int ret, i; ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); if (ret) @@ -1043,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + for (i = 0; i < nr_pages; i++) + SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); + /* * It might be that the vmemmap_pages fully span sections. 
If that is * the case, mark those sections online here as otherwise they will be -- cgit v1.2.3 From e8da368a1e42a8056d1a6b419e1b91b6cf11d77e Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Mon, 20 Jun 2022 07:15:16 +0000 Subject: mm, docs: fix comments that mention mem_hotplug_end() Comments that mention mem_hotplug_end() are confusing as there is no function called mem_hotplug_end(). Fix them by replacing all the occurrences of mem_hotplug_end() in the comments with mem_hotplug_done(). [akpm@linux-foundation.org: grammatical fixes] Link: https://lkml.kernel.org/r/20220620071516.1286101-1-p76091292@gs.ncku.edu.tw Signed-off-by: Yun-Ze Li Cc: Souptick Joarder Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 +++--- mm/compaction.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2b5757752333..735bf5b37949 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -591,8 +591,8 @@ struct zone { * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by - * mem_hotplug_begin/end(). Any reader who can't tolerant drift of - * present_pages should get_online_mems() to get a stable value. + * mem_hotplug_begin/done(). Any reader who can't tolerant drift of + * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; @@ -870,7 +870,7 @@ typedef struct pglist_data { unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started.
*/ struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/end() */ + mem_hotplug_begin/done() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; diff --git a/mm/compaction.c b/mm/compaction.c index 1f89b969c12b..cd029ab03d0e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3011,7 +3011,7 @@ void kcompactd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kcompactd_stop(int nid) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 161096d9311a..f58761cea0a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4645,7 +4645,7 @@ void kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kswapd_stop(int nid) { -- cgit v1.2.3 From 18f3962953e40401b7ed98e8524167282c3e626e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 26 Jun 2022 22:57:17 +0800 Subject: mm: hugetlb: kill set_huge_swap_pte_at() Commit e5251fd43007 ("mm/hugetlb: introduce set_huge_swap_pte_at() helper") added set_huge_swap_pte_at() to handle swap entries on architectures that support hugepages consisting of contiguous ptes. Currently set_huge_swap_pte_at() is only overridden by arm64. set_huge_swap_pte_at() provides a sz parameter to help determine the number of entries to be updated. But in fact, all hugetlb swap entries contain pfn information, so we can find the corresponding folio through the pfn recorded in the swap entry, and folio_size() then determines the number of entries that need to be updated. Considering that users can easily introduce bugs by ignoring the difference between set_huge_swap_pte_at() and set_huge_pte_at(), let's handle swap entries in set_huge_pte_at() and remove set_huge_swap_pte_at(); then we can call set_huge_pte_at() anywhere, which simplifies our coding.
Link: https://lkml.kernel.org/r/20220626145717.53572-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 3 --- arch/arm64/mm/hugetlbpage.c | 34 +++++++++++++++++----------------- include/linux/hugetlb.h | 13 ------------- mm/hugetlb.c | 8 +++----- mm/rmap.c | 11 +++-------- 5 files changed, 23 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1fd2846dbefe..d20f5da2d76f 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET extern pte_t huge_ptep_get(pte_t *ptep); -extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz); -#define set_huge_swap_pte_at set_huge_swap_pte_at void __init arm64_hugetlb_cma_reserve(void); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index e2a5ec9fdc0d..3be8f25aa5be 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm, flush_tlb_range(&vma, saddr, addr); } +static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry) +{ + VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry)); + + return page_folio(pfn_to_page(swp_offset(entry))); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, unsigned long pfn, dpfn; pgprot_t hugeprot; - /* - * Code needs to be expanded to handle huge swap and migration - * entries. Needed for HUGETLB and MEMORY_FAILURE. 
- */ - WARN_ON(!pte_present(pte)); + if (!pte_present(pte)) { + struct folio *folio; + + folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte)); + ncontig = num_contig_ptes(folio_size(folio), &pgsize); + + for (i = 0; i < ncontig; i++, ptep++) + set_pte_at(mm, addr, ptep, pte); + return; + } if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); @@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } -void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ - int i, ncontig; - size_t pgsize; - - ncontig = num_contig_ptes(sz, &pgsize); - - for (i = 0; i < ncontig; i++, ptep++) - set_pte(ptep, pte); -} - pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 756b66ff025e..c6cccfaf8708 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -903,14 +903,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) atomic_long_sub(l, &mm->hugetlb_usage); } -#ifndef set_huge_swap_pte_at -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ - set_huge_pte_at(mm, addr, ptep, pte); -} -#endif - #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, @@ -1094,11 +1086,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ -} - static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65454896f174..064da8ffbac6 100644 --- a/mm/hugetlb.c +++ 
b/mm/hugetlb.c @@ -4798,12 +4798,11 @@ again: entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = huge_pte_mkuffd_wp(entry); - set_huge_swap_pte_at(src, addr, src_pte, - entry, sz); + set_huge_pte_at(src, addr, src_pte, entry); } if (!userfaultfd_wp(dst_vma) && uffd_wp) entry = huge_pte_clear_uffd_wp(entry); - set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { /* * We copy the pte marker only if the dst vma has @@ -6344,8 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); - set_huge_swap_pte_at(mm, address, ptep, - newpte, psize); + set_huge_pte_at(mm, address, ptep, newpte); pages++; } spin_unlock(ptl); diff --git a/mm/rmap.c b/mm/rmap.c index 56134cdc5ca3..83172ee0ea35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1618,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2004,9 +2002,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2074,8 +2070,7 @@ static bool try_to_migrate_one(struct folio *folio, struct 
vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) - set_huge_swap_pte_at(mm, address, pvmw.pte, - swp_pte, vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, swp_pte); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), -- cgit v1.2.3 From 1baec203b77cafa24610b5c9ae7a2aa380d74ef6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:16 +0800 Subject: mm/khugepaged: try to free transhuge swapcache when possible Transhuge swapcaches won't be freed in __collapse_huge_page_copy(). It's because release_pte_page() is not called for these pages and thus free_page_and_swap_cache can't grab the page lock. These pages won't be freed from swap cache even if we are the only user until next time reclaim. It shouldn't hurt indeed, but we could try to free these pages to save more memory for system. Link: https://lkml.kernel.org/r/20220625092816.4856-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ mm/khugepaged.c | 7 ++++++- mm/swap.h | 5 ----- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 95a5b7aa1ae9..6d11c51b2b62 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -455,6 +455,7 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } +extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); /* linux/mm/swapfile.c */ @@ -539,6 +540,10 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity 
check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) +static inline void free_swap_cache(struct page *page) +{ +} + static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 08e885f28def..01e0d6336754 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -755,7 +755,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { list_del(&src_page->lru); - release_pte_page(src_page); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); } } diff --git a/mm/swap.h b/mm/swap.h index fa0816af4712..17936e068c1c 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,7 +41,6 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void free_swap_cache(struct page *page); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); @@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline void free_swap_cache(struct page *page) -{ -} - static inline void show_swap_cache_info(void) { } -- cgit v1.2.3 From 6077c943beee407168f72ece745b0aeaef6b896f Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:08 -0500 Subject: mm: rename is_pinnable_page() to is_longterm_pinnable_page() Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory mapping", v9. This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory owned by a device that can be mapped into CPU page tables like MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE. 
This patch series is mostly self-contained except for a few places where it needs to update other subsystems to handle the new memory type. System stability and performance are not affected according to our ongoing testing, including xfstests. How it works: The system BIOS advertises the GPU device memory (aka VRAM) as SPM (special purpose memory) in the UEFI system address map. The amdgpu driver registers the memory with devmap as MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for this hardware page migration capability is the Frontier supercomputer project. This functionality is not AMD-specific. We expect other GPU vendors to find this functionality useful, and possibly other hardware types in the future. Our test nodes in the lab are similar to the Frontier configuration, with .5 TB of system memory plus 256 GB of device memory split across 4 GPUs, all in a single coherent address space. Page migration is expected to improve application efficiency significantly. We will report empirical results as they become available. Coherent device type pages at gup are now migrated back to system memory if they are being pinned long-term (FOLL_LONGTERM). The reason is, that long-term pinning would interfere with the device memory manager owning the device-coherent pages (e.g. evictions in TTM). These series incorporate Alistair Popple patches to do this migration from pin_user_pages() calls. hmm_gup_test has been added to hmm-test to test different get user pages calls. This series includes handling of device-managed anonymous pages returned by vm_normal_pages. Although they behave like normal pages for purposes of mapping in CPU page tables and for COW, they do not support LRU lists, NUMA migration or THP. We also introduced a FOLL_LRU flag that adds the same behaviour to follow_page and related APIs, to allow callers to specify that they expect to put pages on an LRU list. 
This patch (of 14): is_pinnable_page() and folio_is_pinnable() are renamed to is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively. These functions are used in the FOLL_LONGTERM flag context. Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com Signed-off-by: Alex Sierra Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Felix Kuehling Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jerome Glisse Cc: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- mm/gup.c | 4 ++-- mm/gup_test.c | 2 +- mm/hugetlb.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9cc02a7e503b..3c044e38958c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1607,7 +1607,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { #ifdef CONFIG_CMA int mt = get_pageblock_migratetype(page); @@ -1618,15 +1618,15 @@ static inline bool is_pinnable_page(struct page *page) return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); } #else -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { return true; } #endif -static inline bool folio_is_pinnable(struct folio *folio) +static inline bool folio_is_longterm_pinnable(struct folio *folio) { - return is_pinnable_page(&folio->page); + return is_longterm_pinnable_page(&folio->page); } static inline void set_page_zone(struct page *page, enum zone_type zone) diff --git a/mm/gup.c b/mm/gup.c index 3129b754ade3..a9940e3b3181 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -133,7 +133,7 @@ struct folio 
*try_grab_folio(struct page *page, int refs, unsigned int flags) * path. */ if (unlikely((flags & FOLL_LONGTERM) && - !is_pinnable_page(page))) + !is_longterm_pinnable_page(page))) return NULL; /* @@ -1923,7 +1923,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - if (folio_is_pinnable(folio)) + if (folio_is_longterm_pinnable(folio)) continue; /* diff --git a/mm/gup_test.c b/mm/gup_test.c index d974dec19e1c..12b0a91767d3 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; } else if (cmd == PIN_LONGTERM_BENCHMARK && - WARN(!is_pinnable_page(page), + WARN(!is_longterm_pinnable_page(page), "pages[%lu] is NOT pinnable but pinned\n", i)) { dump_page(page, "gup_test failure"); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 064da8ffbac6..ffdf3fc4a83f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1129,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_pinnable_page(page)) + if (pin && !is_longterm_pinnable_page(page)) continue; if (PageHWPoison(page)) -- cgit v1.2.3 From 5bb88dc571b1cbf0284100a317fb21ab7d03e40c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:09 -0500 Subject: mm: move page zone helpers from mm.h to mmzone.h It makes more sense to have these helpers in zone specific header file, rather than the generic mm.h Link: https://lkml.kernel.org/r/20220715150521.18165-3-alex.sierra@amd.com Signed-off-by: Alex Sierra Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- include/linux/mm.h | 78 ---------------------------------------------- 
include/linux/mmzone.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9f5ee49482de..732dde5988fb 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -2,7 +2,7 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c044e38958c..a2d01e49253b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1045,84 +1045,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ - -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) -#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) - -/* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. - */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) -#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) - -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? 
\ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \ - NODES_PGOFF : ZONES_PGOFF) -#endif - -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) - -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) -#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) - -static inline enum zone_type page_zonenum(const struct page *page) -{ - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; -} - -static inline enum zone_type folio_zonenum(const struct folio *folio) -{ - return page_zonenum(&folio->page); -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_DEVICE; -} -extern void memmap_init_zone_device(struct zone *, unsigned long, - unsigned long, struct dev_pagemap *); -#else -static inline bool is_zone_device_page(const struct page *page) -{ - return false; -} -#endif - -static inline bool folio_is_zone_device(const struct folio *folio) -{ - return is_zone_device_page(&folio->page); -} - -static inline bool is_zone_movable_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_MOVABLE; -} - #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 735bf5b37949..5da1135e6755 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +#ifndef BUILD_VDSO32_64 +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. 
+ */ + +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) +#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) + +/* + * Define the bit shifts to access each section. For non-existent + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) +#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) + +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ + SECTIONS_PGOFF : ZONES_PGOFF) +#else +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? 
\ + NODES_PGOFF : ZONES_PGOFF) +#endif + +#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) + +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) +#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) + +static inline enum zone_type page_zonenum(const struct page *page) +{ + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; +} + +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_zone_device_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_DEVICE; +} +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); +#else +static inline bool is_zone_device_page(const struct page *page) +{ + return false; +} +#endif + +static inline bool folio_is_zone_device(const struct folio *folio) +{ + return is_zone_device_page(&folio->page); +} + +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} +#endif + /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone -- cgit v1.2.3 From f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:10 -0500 Subject: mm: add zone device coherent type memory support Device memory that is cache coherent from device and CPU point of view. This is used on platforms that have an advanced system bus (like CAPI or CXL). Any page of a process can be migrated to such memory. However, no one should be allowed to pin such memory so that it can always be evicted. 
[hch@lst.de: rebased ontop of the refcount changes, remove is_dev_private_or_coherent_page] Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 19 +++++++++++++++++++ include/linux/mm.h | 5 ++++- mm/memcontrol.c | 7 ++++--- mm/memory-failure.c | 8 ++++++-- mm/memremap.c | 10 ++++++++++ mm/migrate_device.c | 16 +++++++--------- mm/rmap.c | 5 +++-- 7 files changed, 53 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 732dde5988fb..09320b7f706c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -41,6 +41,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. 
In support of coordinating page @@ -61,6 +68,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline bool is_device_coherent_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool folio_is_device_coherent(const struct folio *folio) +{ + return is_device_coherent_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index a2d01e49253b..64393ed3330a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -28,6 +28,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); + return !(is_device_coherent_page(page) || + is_zone_movable_page(page) || + is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_longterm_pinnable_page(struct page *page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1497affe08c4..b1868784f895 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5716,8 +5716,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). 
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5755,7 +5755,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f7612ccdb299..b7ca5db7e60e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1686,12 +1686,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: /* - * TODO: Handle HMM pages which may need coordination + * TODO: Handle device pages which may need coordination * with device-side memory. 
*/ goto unlock; + default: + break; } /* diff --git a/mm/memremap.c b/mm/memremap.c index 8b5c8fd4ea8e..f0955785150f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; + case MEMORY_DEVICE_COHERENT: + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; case MEMORY_DEVICE_FS_DAX: if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5052093d0262..a4847ad65da3 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(migrate_vma_setup); * handle_pte_fault() * do_anonymous_page() * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * private or coherent page. */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -594,11 +594,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { - /* - * For now we only support migrating to un-addressable device - * memory. - */ - if (is_zone_device_page(page)) { + if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } @@ -701,10 +698,11 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_device_private_page(newpage)) { + if (is_device_private_page(newpage) || + is_device_coherent_page(newpage)) { /* - * For now only support private anonymous when migrating - * to un-addressable device memory. + * For now only support anonymous memory migrating to + * device private or coherent memory. 
*/ if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; diff --git a/mm/rmap.c b/mm/rmap.c index 83172ee0ea35..0532fd92ecb3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1953,7 +1953,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (folio_is_zone_device(folio)) { + if (folio_is_device_private(folio)) { unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; @@ -2124,7 +2124,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) TTU_SYNC))) return; - if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) + if (folio_is_zone_device(folio) && + (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* -- cgit v1.2.3 From dd19e6d8ffaa1289d75d7833de97faf1b6b2c8e4 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:12 -0500 Subject: mm: add device coherent vma selection for memory migration This case is used to migrate pages from device memory, back to system memory. Device coherent type memory is cache coherent from device and CPU point of view. 
Link: https://lkml.kernel.org/r/20220715150521.18165-6-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 + mm/migrate_device.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f3..b84908debe5c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a4847ad65da3..18bc6483f63a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -148,15 +148,21 @@ again: if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; goto next; } page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner != migrate->pgmap_owner)) + goto next; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? 
MIGRATE_PFN_WRITE : 0; } -- cgit v1.2.3 From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:25 +0800 Subject: dax: introduce holder for dax_device Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2. The patchset fsdax-rmap aims to support shared pages tracking for fsdax. It moves owner tracking from dax_associate_entry() to pmem device driver, by introducing an interface ->memory_failure() for struct pagemap. This interface is called by memory_failure() in mm, and implemented by pmem device. Then call holder operations to find the filesystem in which the corrupted data is located, and call filesystem handler to track files or metadata associated with this page. Finally we are able to try to fix the corrupted data in filesystem and do other necessary processing, such as killing processes who are using the files affected. The call trace is like this: memory_failure() |* fsdax case |------------ |pgmap->ops->memory_failure() => pmem_pgmap_memory_failure() | dax_holder_notify_failure() => | dax_device->holder_ops->notify_failure() => | - xfs_dax_notify_failure() | |* xfs_dax_notify_failure() | |-------------------------- | | xfs_rmap_query_range() | | xfs_dax_failure_fn() | | * corrupted on metadata | | try to recover data, call xfs_force_shutdown() | | * corrupted on file data | | try to recover data, call mf_dax_kill_procs() |* normal case |------------- |mf_generic_kill_procs() The patchset fsdax-reflink attempts to add CoW support for fsdax, and takes XFS, which has both reflink and fsdax features, as an example. One of the key mechanisms needed to be implemented in fsdax is CoW. Copy the data from srcmap before we actually write data to the destination iomap. And we just copy the range in which data won't be changed. Another mechanism is range comparison. In page cache case, readpage() is used to load data on disk to page cache in order to be able to compare data. 
In fsdax case, readpage() does not work. So we need another way to compare data, one with direct access support. With the two mechanisms implemented in fsdax, we are able to make reflink and fsdax work together in XFS. This patch (of 14): To easily track filesystem from a pmem device, we introduce a holder for dax_device structure, and also its operation. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule. That way we can finally track down the filesystem we need. The holder and holder_ops will be set when a filesystem is being mounted, or a target device is being activated. Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. 
Wong Cc: Dave Chinner Cc: Jane Chu Cc: Goldwyn Rodrigues Cc: Al Viro Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Miaohe Lin Cc: Dan Williams Cc: Goldwyn Rodrigues Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.c | 2 +- fs/erofs/super.c | 10 ++++---- fs/ext2/super.c | 7 +++--- fs/ext4/super.c | 9 +++---- fs/xfs/xfs_buf.c | 5 ++-- include/linux/dax.h | 33 +++++++++++++++++++------- 7 files changed, 110 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 50a08b2ec247..9b5e2a5eb0ae 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -22,6 +22,8 @@ * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device + * @holder_data: holder of a dax_device: could be filesystem or mapped device + * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; @@ -29,6 +31,8 @@ struct dax_device { void *private; unsigned long flags; const struct dax_operations *ops; + void *holder_data; + const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; @@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host); * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts + * @holder: filesystem or mapped device inside the dax_device + * @ops: operations for the inner holder */ -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; @@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if 
(!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; + else if (holder) { + if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) + dax_dev->holder_ops = ops; + else + dax_dev = NULL; + } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); + +void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ + if (dax_dev && holder && + cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) + dax_dev->holder_ops = NULL; + put_dax(dax_dev); +} +EXPORT_SYMBOL_GPL(fs_put_dax); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { @@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_recovery_write); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc, id; + + id = dax_read_lock(); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(id); + return rc; +} +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) @@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; + if (dax_dev->holder_data != NULL) + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); + + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +/** + * dax_holder() - obtain the holder of a dax device + * @dax_dev: a dax_device instance + + * Return: the holder's data which represents the holder if registered, + * otherwize NULL. 
+ */ +void *dax_holder(struct dax_device *dax_dev) +{ + return dax_dev->holder_data; +} +EXPORT_SYMBOL_GPL(dax_holder); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b75f1ef7386..0177a4ce9a18 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 95addc5c9d34..3173debeaa5a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off); + &sbi->dax_part_off, + NULL, NULL); } err = erofs_read_superblock(sb); @@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; - fs_put_dax(dif->dax_dev); + fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); erofs_fscache_unregister_cookie(&dif->fscache); @@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb) return; erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev); + fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..4638946251b9 100644 
--- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -1204,7 +1205,7 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 845f2f8aee5f..1f8bf507ba5a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi) return; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) if (!sbi) return NULL; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); @@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) sbi->s_sb = sb; return sbi; err_out: - 
fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4aa9c9cf5b6e..1ec2a7b6d44e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1911,7 +1911,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev); + fs_put_dax(btp->bt_daxdev, NULL); kmem_free(btp); } @@ -1964,7 +1964,8 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, + NULL); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..cf85fc36da5f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct 
dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:27 +0800 Subject: pagemap,pmem: introduce ->memory_failure() When memory-failure occurs, we call this function which is implemented by each kind of devices. For the fsdax case, pmem device driver implements it. 
Pmem device driver will find out the filesystem in which the corrupted page is located. With dax_holder notify support, we are able to notify the memory failure from the pmem driver to upper layers. If something is not supported in the notify routine, memory_failure will fall back to the generic handler. Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Reviewed-by: Naoya Horiguchi Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 17 +++++++++++++++++ include/linux/memremap.h | 12 ++++++++++++ mm/memory-failure.c | 14 ++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'include') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 629d10fcf53b..107c9cb3d57d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem) blk_cleanup_disk(pmem->disk); } +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct pmem_device *pmem = + container_of(pgmap, struct pmem_device, pgmap); + u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags); +} + +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .memory_failure = pmem_pagemap_memory_failure, +}; + static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; if (is_nd_pfn(dev)) { pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb
= nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pgmap.range.end = res->end; pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; bb_range = pmem->pgmap.range; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 09320b7f706c..19010491a603 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -87,6 +87,18 @@ struct dev_pagemap_ops { * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); + + /* + * Handle the memory failure happens on a range of pfns. Notify the + * processes who are using these pfns, and try to recover the data on + * them if necessary. The mf_flags is finally passed to the recover + * function through the whole notify routine. + * + * When this is not implemented, or it returns -EOPNOTSUPP, the caller + * will fall back to a common handler called mf_generic_kill_procs(). + */ + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, + unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f8a8a5d45eba..46c77151f726 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1748,6 +1748,20 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, if (!pgmap_pfn_valid(pgmap, pfn)) goto out; + /* + * Call driver's implementation to handle the memory failure, otherwise + * fall back to generic handler. + */ + if (pgmap->ops->memory_failure) { + rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); + /* + * Fall back to generic handler too if operation is not + * supported inside the driver/device/filesystem. 
+ */ + if (rc != -EOPNOTSUPP) + goto out; + } + rc = mf_generic_kill_procs(pfn, flags, pgmap); out: /* drop pgmap ref acquired in caller */ -- cgit v1.2.3 From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:28 +0800 Subject: fsdax: introduce dax_lock_mapping_entry() The current dax_lock_page() locks dax entry by obtaining mapping and index in page. To support 1-to-N RMAP in NVDIMM, we need a new function to lock a specific dax entry corresponding to this file's mapping,index. And output the page corresponding to the specific dax entry for caller use. Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 15 +++++++++++++ 2 files changed, 78 insertions(+) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 4155a6107fa1..65e44d78b3bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -455,6 +455,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie) dax_unlock_entry(&xas, (void *)cookie); } +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping + * @mapping: the file's mapping whose entry we want to lock + * @index: the offset within this file + * @page: output the dax page corresponding to this dax entry + * + * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry + * could not be locked. 
+ */ +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, + struct page **page) +{ + XA_STATE(xas, NULL, 0); + void *entry; + + rcu_read_lock(); + for (;;) { + entry = NULL; + if (!dax_mapping(mapping)) + break; + + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); + xas_set(&xas, index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + rcu_read_unlock(); + wait_entry_unlocked(&xas, entry); + rcu_read_lock(); + continue; + } + if (!entry || + dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Because we are looking for entry from file's mapping + * and index, so the entry may not be inserted for now, + * or even a zero/empty entry. We don't think this is + * an error case. So, return a special value and do + * not output @page. + */ + entry = (void *)~0UL; + } else { + *page = pfn_to_page(dax_to_pfn(entry)); + dax_lock_entry(&xas, entry); + } + xas_unlock_irq(&xas); + break; + } + rcu_read_unlock(); + return (dax_entry_t)entry; +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, + dax_entry_t cookie) +{ + XA_STATE(xas, &mapping->i_pages, index); + + if (cookie == ~0UL) + return; + + dax_unlock_entry(&xas, (void *)cookie); +} + /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. 
If the page cache doesn't contain an entry at diff --git a/include/linux/dax.h b/include/linux/dax.h index cf85fc36da5f..7116681b48c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page); +void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { @@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page) static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } + +static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page) +{ + return 0; +} + +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie) +{ +} #endif int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, -- cgit v1.2.3 From c36e2024957120566efd99395b5c8cc95b5175c1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:29 +0800 Subject: mm: introduce mf_dax_kill_procs() for fsdax case This new function is a variant of mf_generic_kill_procs that accepts a file, offset pair instead of a struct to support multiple files sharing a DAX mapping. It is intended to be called by the file systems as part of the memory_failure handler after the file system performed a reverse mapping from the storage address to the file and file offset. 
Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Miaohe Lin Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory-failure.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 64393ed3330a..d4ebfc206e2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3178,6 +3178,8 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46c77151f726..c9931c676335 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -297,10 +297,9 @@ void shake_page(struct page *p) } EXPORT_SYMBOL_GPL(shake_page); -static unsigned long dev_pagemap_mapping_shift(struct page *page, - struct vm_area_struct *vma) +static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, + unsigned long address) { - unsigned long address = vma_address(page, vma); unsigned long ret = 0; pgd_t *pgd; p4d_t *p4d; @@ -340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * + * Notice: @fsdax_pgoff is used only when @p is a fsdax page. 
+ * In other cases, such as anonymous and file-backed pages, the address to be + * killed can be calculated from @p itself. */ static void add_to_kill(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + pgoff_t fsdax_pgoff, struct vm_area_struct *vma, + struct list_head *to_kill) { struct to_kill *tk; @@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } tk->addr = page_address_in_vma(p, vma); - if (is_zone_device_page(p)) - tk->size_shift = dev_pagemap_mapping_shift(p, vma); - else + if (is_zone_device_page(p)) { + /* + * Since page->mapping is not used for fsdax, we need to + * calculate the address based on the vma. + */ + if (p->pgmap->type == MEMORY_DEVICE_FS_DAX) + tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + } else tk->size_shift = page_shift(compound_head(p)); /* @@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -541,13 +550,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. */ if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); } } read_unlock(&tasklist_lock); i_mmap_unlock_read(mapping); } +#ifdef CONFIG_FS_DAX +/* + * Collect processes when the error hit a fsdax page.
+ */ +static void collect_procs_fsdax(struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + read_lock(&tasklist_lock); + for_each_process(tsk) { + struct task_struct *t = task_early_kill(tsk, true); + + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + if (vma->vm_mm == t->mm) + add_to_kill(t, page, pgoff, vma, to_kill); + } + } + read_unlock(&tasklist_lock); + i_mmap_unlock_read(mapping); +} +#endif /* CONFIG_FS_DAX */ + /* * Collect the processes who have the corrupted page mapped to kill. */ @@ -1588,6 +1625,45 @@ unlock: return rc; } +#ifdef CONFIG_FS_DAX +/** + * mf_dax_kill_procs - Collect and kill processes who are using this file range + * @mapping: address_space of the file in use + * @index: start pgoff of the range within the file + * @count: length of the range, in unit of PAGE_SIZE + * @mf_flags: memory failure flags + */ +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags) +{ + LIST_HEAD(to_kill); + dax_entry_t cookie; + struct page *page; + size_t end = index + count; + + mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + for (; index < end; index++) { + page = NULL; + cookie = dax_lock_mapping_entry(mapping, index, &page); + if (!cookie) + return -EBUSY; + if (!page) + goto unlock; + + SetPageHWPoison(page); + + collect_procs_fsdax(page, mapping, index, &to_kill); + unmap_and_kill(&to_kill, page_to_pfn(page), mapping, + index, mf_flags); +unlock: + dax_unlock_mapping_entry(mapping, index, cookie); + } + return 0; +} +EXPORT_SYMBOL_GPL(mf_dax_kill_procs); +#endif /* CONFIG_FS_DAX */ + /* * Called from hugetlb code with hugetlb_lock held. 
* -- cgit v1.2.3 From 6061b69b9a550a2ab84e805d0d2315ba6215f112 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:31 +0800 Subject: fsdax: set a CoW flag when associate reflink mappings Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file mappings. In this case, since the dax-rmap has already taken the responsibility to look up the shared files for a given dax page, the page->mapping is no longer used for rmap but for marking that this dax page is shared. And to make sure disassociation works fine, we use page->index as refcount, and clear page->mapping to the initial state when page->index is decreased to 0. With the help of this new flag, we are able to distinguish the normal case from the CoW case, and keep the warning in the normal case. Link: https://lkml.kernel.org/r/20220603053738.1218681-8-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 50 +++++++++++++++++++++++++++++++++++++--------- include/linux/page-flags.h | 6 ++++++ 2 files changed, 47 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 65e44d78b3bb..b59b864017ad 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) +static inline bool dax_mapping_is_cow(struct address_space *mapping) +{ + return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; +} + /* - * TODO: for reflink+dax we need a way to associate a single page with - * multiple address_space instances at different linear_page_index() - * offsets. + * Set the page->mapping with PAGE_MAPPING_DAX_COW flag, increase the refcount.
+ */ +static inline void dax_mapping_set_cow(struct page *page) +{ + if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + /* + * Reset the index if the page was already mapped + * regularly before. + */ + if (page->mapping) + page->index = 1; + page->mapping = (void *)PAGE_MAPPING_DAX_COW; + } + page->index++; +} + +/* + * When it is called in dax_insert_entry(), the cow flag will indicate + * whether this entry is shared by multiple files. If so, set the page->mapping + * to PAGE_MAPPING_DAX_COW, and use page->index as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool cow) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = index + i++; + if (cow) { + dax_mapping_set_cow(page); + } else { + WARN_ON_ONCE(page->mapping); + page->mapping = mapping; + page->index = index + i++; + } } } @@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - WARN_ON_ONCE(page->mapping && page->mapping != mapping); + if (dax_mapping_is_cow(page->mapping)) { + /* keep the CoW flag if this page is still shared */ + if (page->index-- > 0) + continue; + } else + WARN_ON_ONCE(page->mapping && page->mapping != mapping); page->mapping = NULL; page->index = 0; } @@ -830,7 +861,8 @@ static void *dax_insert_entry(struct xa_state *xas, void *old; dax_disassociate_entry(entry, mapping, false); - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); + dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, + false); /* * Only swap our new
entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 82719d33c0f1..f2ff65f1bf83 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -661,6 +661,12 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +/* + * Unlike the flags above, this flag is used only in fsdax mode. It + * indicates that this page->mapping is now in the reflink case. + */ +#define PAGE_MAPPING_DAX_COW 0x1 + static __always_inline bool folio_mapping_flags(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:36 +0800 Subject: fsdax: dedup file range to use a compare function With dax we cannot deal with readpage() etc. So, we create a dax comparison function which is similar to vfs_dedupe_file_range_compare(). And introduce dax_remap_file_range_prep() for filesystem use. Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/remap_range.c | 31 ++++++++++++++++---- fs/xfs/xfs_reflink.c | 8 +++-- include/linux/dax.h | 8 +++++ include/linux/fs.h | 12 +++++--- 5 files changed, 130 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 0aab32300531..e0f9c4a0a0c1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); + +static loff_t dax_range_compare_iter(struct iomap_iter *it_src, + struct iomap_iter *it_dest, u64 len, bool *same) +{ + const struct iomap *smap = &it_src->iomap; + const struct iomap *dmap = &it_dest->iomap; + loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + void *saddr, *daddr; + int id, ret; + + len = min(len, min(smap->length, dmap->length)); + + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { + *same = true; + return len; + } + + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { + *same = false; + return 0; + } + + id = dax_read_lock(); + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), + &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), + &daddr, NULL); + if (ret < 0) + goto out_unlock; + + *same = !memcmp(saddr, daddr, len); + if (!*same) + len = 0; + dax_read_unlock(id); + return len; + +out_unlock: + dax_read_unlock(id); + return -EIO; +} + +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dst, loff_t dstoff, loff_t len, bool *same, + const struct iomap_ops *ops) +{ + struct iomap_iter src_iter = { + .inode = src, + .pos = 
srcoff, + .len = len, + .flags = IOMAP_DAX, + }; + struct iomap_iter dst_iter = { + .inode = dst, + .pos = dstoff, + .len = len, + .flags = IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&src_iter, ops)) > 0) { + while ((ret = iomap_iter(&dst_iter, ops)) > 0) { + dst_iter.processed = dax_range_compare_iter(&src_iter, + &dst_iter, len, same); + } + if (ret <= 0) + src_iter.processed = ret; + } + return ret; +} + +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, ops); +} +EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..231de627c1b9 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #include @@ -271,9 +272,11 @@ out_error: * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. 
*/ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +int +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(file_in, pos_in, - file_out, pos_out, *len, &is_same); + if (*len == 0) + return 0; + + if (!IS_DAX(inode_in)) + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); + else if (dax_read_ops) + ret = dax_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same, + dax_read_ops); + else + return -EINVAL; if (ret) return ret; if (!is_same) @@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } + +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, NULL); +} EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..cbaf36d21020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + 
else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; diff --git a/include/linux/dax.h b/include/linux/dax.h index 7116681b48c0..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..134e9d7ad5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int 
generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.2.3 From 4fa6893faeaaea4fe4440512d2a708527ef47051 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:35 -0700 Subject: mm: thp: consolidate vma size check to transhuge_vma_suitable There are a couple of places that check whether the vma size is OK for THP or whether the address fits; they are open-coded and duplicated. Use transhuge_vma_suitable() to do the job by passing in (vma->vm_end - HPAGE_PMD_SIZE). Move the vma size check into hugepage_vma_check(). This makes khugepaged_enter() the same as khugepaged_enter_vma(). There is just one caller of khugepaged_enter(); replace it with khugepaged_enter_vma() and remove khugepaged_enter(). Link: https://lkml.kernel.org/r/20220616174840.1202070-3-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++++++++ include/linux/khugepaged.h | 14 -------------- mm/huge_memory.c | 2 +- mm/khugepaged.c | 19 ++++++------------- 4 files changed, 18 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 648cb3ce7099..8a5a8bfce0f5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,17 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +/* + * Do the below checks: + * - For file vma, check if the linear page offset of vma is + * HPAGE_PMD_NR aligned within the file.
The hugepage is + * guaranteed to be hugepage-aligned within the file, but we must + * check that the PMD-aligned addresses in the VMA map to + * PMD-aligned offsets within the file, else the hugepage will + * not be PMD-mappable. + * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE + * area. + */ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 392d34c3c59a..31ca8a7f78f4 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -51,16 +51,6 @@ static inline void khugepaged_exit(struct mm_struct *mm) if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } - -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) - __khugepaged_enter(vma->vm_mm); - } -} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -68,10 +58,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ -} static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a563de8234c1..2751649aaf33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -726,7 +726,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 
3eec970a884d..c7e22135f1b5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -443,8 +443,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (vma_is_dax(vma)) return false; - if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - - vma->vm_pgoff, HPAGE_PMD_NR)) + /* Check alignment for file vma and size for both file and anon vma */ + if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; /* Enabled via shmem mount options or sysfs settings. */ @@ -505,9 +505,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled() && - (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK))) { + khugepaged_enabled()) { if (hugepage_vma_check(vma, vm_flags)) __khugepaged_enter(vma->vm_mm); } @@ -948,7 +946,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct **vmap) { struct vm_area_struct *vma; - unsigned long hstart, hend; if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; @@ -957,9 +954,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; @@ -2135,10 +2130,8 @@ skip: progress++; continue; } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; + hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); + hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) -- cgit v1.2.3 From 9fec51689ff60d9766b38051a0b1692f93d95364 Mon 
Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:37 -0700 Subject: mm: thp: kill transparent_hugepage_active() The transparent_hugepage_active() was introduced to show THP eligibility bit in smaps in proc, smaps is the only user. But it actually does the similar check as hugepage_vma_check() which is used by khugepaged. We definitely don't have to maintain two similar checks, so kill transparent_hugepage_active(). This patch also fixed the wrong behavior for VM_NO_KHUGEPAGED vmas. Also move hugepage_vma_check() to huge_memory.c and huge_mm.h since it is not only for khugepaged anymore. [akpm@linux-foundation.org: check vma->vm_mm, per Zach] [akpm@linux-foundation.org: add comment to vdso check] Link: https://lkml.kernel.org/r/20220616174840.1202070-5-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 16 ++++++++------ include/linux/khugepaged.h | 2 -- mm/huge_memory.c | 53 ++++++++++++++++++++++++++++++++++++++-------- mm/khugepaged.c | 48 ++++------------------------------------- 5 files changed, 59 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1d7fd832123b..072cf770b5d0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - transparent_hugepage_active(vma)); + hugepage_vma_check(vma, vma->vm_flags, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a5a8bfce0f5..64487bcd0c7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -202,7 +202,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && 
S_ISREG(inode->i_mode); } -bool transparent_hugepage_active(struct vm_area_struct *vma); +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -351,11 +353,6 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static inline bool transparent_hugepage_active(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { @@ -368,6 +365,13 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, return false; } +static inline bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps) +{ + return false; +} + static inline void prep_transhuge_page(struct page *page) {} #define transparent_hugepage_flags 0UL diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 31ca8a7f78f4..ea5fd4c398f7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2751649aaf33..8cbd21aaf03e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,21 +69,56 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_active(struct vm_area_struct *vma) +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps) { - /* The addr is used to check if the 
vma size fits */ - unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + if (!vma->vm_mm) /* vdso */ + return false; + + if (!transhuge_vma_enabled(vma, vm_flags)) + return false; - if (!transhuge_vma_suitable(vma, addr)) + if (vm_flags & VM_NO_KHUGEPAGED) return false; - if (vma_is_anonymous(vma)) - return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma)) + + /* Don't run khugepaged against DAX vma */ + if (vma_is_dax(vma)) + return false; + + /* Check alignment for file vma and size for both file and anon vma */ + if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) + return false; + + /* Enabled via shmem mount options or sysfs settings. */ + if (shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma)) + + if (!khugepaged_enabled()) + return false; + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Only regular file is valid */ + if (file_thp_enabled(vma)) return true; - return false; + if (!vma_is_anonymous(vma)) + return false; + + if (vma_is_temporary_stack(vma)) + return false; + + /* + * THPeligible bit of smaps should show 1 for proper VMAs even + * though anon_vma is not initialized yet. 
+ */ + if (!vma->anon_vma) + return smaps; + + return true; } static bool get_huge_zero_page(void) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 67e144e64b7f..6bbf3adac534 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -430,46 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!transhuge_vma_enabled(vma, vm_flags)) - return false; - - if (vm_flags & VM_NO_KHUGEPAGED) - return false; - - /* Don't run khugepaged against DAX vma */ - if (vma_is_dax(vma)) - return false; - - /* Check alignment for file vma and size for both file and anon vma */ - if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; - - /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); - - if (!khugepaged_enabled()) - return false; - - /* THP settings require madvise. 
*/ - if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) - return false; - - /* Only regular file is valid */ - if (file_thp_enabled(vma)) - return true; - - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return false; - if (vma_is_temporary_stack(vma)) - return false; - - return true; -} - void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -506,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) + if (hugepage_vma_check(vma, vm_flags, false)) __khugepaged_enter(vma->vm_mm); } } @@ -956,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags)) + if (!hugepage_vma_check(vma, vma->vm_flags, false)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1441,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * the valid THP. 
Add extra VM_HUGEPAGE so hugepage_vma_check() * will not fail the vma for missing VM_HUGEPAGE */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2131,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false)) { skip: progress++; continue; -- cgit v1.2.3 From 7da4e2cb8b1ff8221759bfc7512d651ee69516dc Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:38 -0700 Subject: mm: thp: kill __transhuge_page_enabled() The page fault path checks THP eligibility with __transparent_hugepage_enabled(), which does a similar thing to hugepage_vma_check(), so use hugepage_vma_check() instead. However, page fault allows DAX and !anon_vma cases, so add a new flag, in_pf, to hugepage_vma_check() to make page fault work correctly. The in_pf flag is also used to skip shmem and file THP for page fault, since shmem handles THP in its own shmem_fault() and file THP allocation on fault is not supported yet. Also remove transhuge_vma_enabled(): since hugepage_vma_check() is its only caller now, it is not necessary to have a helper function. Link: https://lkml.kernel.org/r/20220616174840.1202070-6-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A.
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 57 ++----------------------------------------------- mm/huge_memory.c | 51 +++++++++++++++++++++++++++++++++---------- mm/khugepaged.c | 8 +++---- mm/memory.c | 7 ++++-- 5 files changed, 52 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 072cf770b5d0..a3398d0f1927 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true)); + hugepage_vma_check(vma, vma->vm_flags, true, false)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 64487bcd0c7b..cd8a6c5d9fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -146,48 +146,6 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* Explicitly disabled through madvise. */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - return true; -} - -/* - * to be used on vmas which are known to support THP. - * Use transparent_hugepage_active otherwise - */ -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - - /* - * If the hardware/firmware marked hugepage support disabled. 
- */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) - return false; - - if (!transhuge_vma_enabled(vma, vma->vm_flags)) - return false; - - if (vma_is_temporary_stack(vma)) - return false; - - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) - return true; - - if (vma_is_dax(vma)) - return true; - - if (transparent_hugepage_flags & - (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) - return !!(vma->vm_flags & VM_HUGEPAGE); - - return false; -} - static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -204,7 +162,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps); + bool smaps, bool in_pf); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -348,26 +306,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { return false; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - return false; -} - static inline bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps) + bool smaps, bool in_pf) { return false; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8cbd21aaf03e..4b90c7021e52 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -71,27 +71,53 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps) + bool smaps, bool in_pf) { if (!vma->vm_mm) /* vdso */ return false; - if (!transhuge_vma_enabled(vma, vm_flags)) + /* + * Explicitly disabled through madvise or prctl, or some + * architectures may disable THP for some mappings, for + * example, s390 kvm. 
+ * */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - - if (vm_flags & VM_NO_KHUGEPAGED) + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; - /* Don't run khugepaged against DAX vma */ + /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) + return in_pf; + + /* + * Special VMA and hugetlb VMA. + * Must be checked after dax since some dax mappings may have + * VM_MIXEDMAP set. + */ + if (vm_flags & VM_NO_KHUGEPAGED) return false; - /* Check alignment for file vma and size for both file and anon vma */ - if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) + /* + * Check alignment for file vma and size for both file and anon vma. + * + * Skip the check for page fault. Huge fault does the check in fault + * handlers. And this check is not suitable for huge PUD fault. + */ + if (!in_pf && + !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; - /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file)) + /* + * Enabled via shmem mount options or sysfs settings. + * Must be done before hugepage flags check since shmem has its + * own flags. + */ + if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); if (!khugepaged_enabled()) @@ -102,7 +128,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, return false; /* Only regular file is valid */ - if (file_thp_enabled(vma)) + if (!in_pf && file_thp_enabled(vma)) return true; if (!vma_is_anonymous(vma)) @@ -114,9 +140,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma, /* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. + * + * Allow page fault since anon_vma may be not initialized until + * the first page fault. 
*/ if (!vma->anon_vma) - return smaps; + return (smaps || in_pf); return true; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6bbf3adac534..d683ef1edeb5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -466,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false)) + if (hugepage_vma_check(vma, vm_flags, false, false)) __khugepaged_enter(vma->vm_mm); } } @@ -916,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1401,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() * will not fail the vma for missing VM_HUGEPAGE */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false)) + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2091,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { skip: progress++; continue; diff --git a/mm/memory.c b/mm/memory.c index dce0b2e686eb..2392d5db473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4970,6 +4970,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; + unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -4983,7 +4984,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct 
*vma, if (!vmf.pud) return VM_FAULT_OOM; retry_pud: - if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { + if (pud_none(*vmf.pud) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5016,7 +5018,8 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { + if (pmd_none(*vmf.pmd) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3 From 1064026bab9f011bdea1251d44d66bbbcee04f6e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:39 -0700 Subject: mm: khugepaged: reorg some khugepaged helpers The khugepaged_{enabled|always|req_madv} are not khugepaged only anymore, move them to huge_mm.h and rename to hugepage_flags_xxx, and remove khugepaged_req_madv due to no users. Also move khugepaged_defrag to khugepaged.c since its only caller is in that file, it doesn't have to be in a header file. Link: https://lkml.kernel.org/r/20220616174840.1202070-7-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++++ include/linux/khugepaged.h | 14 -------------- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 18 +++++++++++------- 4 files changed, 21 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cd8a6c5d9fe5..ae3d8e2fd9e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,14 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +#define hugepage_flags_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b90c7021e52..8e1b3d9f7ebf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -120,11 +120,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!khugepaged_enabled()) + if (!hugepage_flags_enabled()) return false; /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) return false; /* Only regular file is valid */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d683ef1edeb5..01f71786d530 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -465,7 +465,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { + hugepage_flags_enabled()) { if (hugepage_vma_check(vma, vm_flags, false, false)) __khugepaged_enter(vma->vm_mm); } @@ -761,6 +761,10 @@ static bool khugepaged_scan_abort(int nid) return false; } +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1< Date: Tue, 21 Jun 2022 16:56:17 -0700 Subject: hugetlb: skip to end of PT page mapping when pte not present Patch series "hugetlb: speed up linear address scanning", v2. 
At unmap, fork and remap time hugetlb address ranges are linearly scanned. We can optimize these scans if the ranges are sparsely populated. Also, enable page table "Lazy copy" for hugetlb at fork. NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to add an arch-specific version of hugetlb_mask_last_page() to take advantage of sparse address scanning improvements. Baolin Wang added the routine for arm64. Other architectures which could be optimized are: ia64, mips, parisc, powerpc, s390, sh and sparc. This patch (of 4): HugeTLB address ranges are linearly scanned during fork, unmap and remap operations. If a non-present entry is encountered, the code currently continues to the next huge-page-aligned address. However, a non-present entry implies that the page table page for that entry is not present. Therefore, the linear scan can skip to the end of the range mapped by the page table page. This can speed up operations on large, sparsely populated hugetlb mappings. Create a new routine hugetlb_mask_last_page() that will return an address mask. When the mask is ORed with an address, the result will be the address of the last huge page mapped by the associated page table page. Use this mask to update addresses in routines which linearly scan hugetlb address ranges when a non-present pte is encountered. hugetlb_mask_last_page is related to the implementation of huge_pte_offset as hugetlb_mask_last_page is called when huge_pte_offset returns NULL. This patch only provides a complete hugetlb_mask_last_page implementation when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined. Architectures which provide their own versions of huge_pte_offset can also provide their own version of hugetlb_mask_last_page.
Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Muchun Song Reported-by: kernel test robot Cc: Michal Hocko Cc: Peter Xu Cc: Naoya Horiguchi Cc: James Houghton Cc: Mina Almasry Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Paul Walmsley Cc: Christian Borntraeger Cc: Catalin Marinas Cc: Will Deacon Cc: Rolf Eike Beer Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c6cccfaf8708..ce30fad5fd13 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); +unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ffdf3fc4a83f..95fd1c36c17f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4727,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; + unsigned long last_addr_mask; int ret = 0; if (cow) { @@ -4746,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, i_mmap_lock_read(mapping); } + last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; 
addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr, sz); - if (!src_pte) + if (!src_pte) { + addr |= last_addr_mask; continue; + } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -4767,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * after taking the lock below. */ dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + addr |= last_addr_mask; continue; + } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -4928,6 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; + unsigned long last_addr_mask; unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; @@ -4943,12 +4950,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); - if (!src_pte) + if (!src_pte) { + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; + } if (huge_pte_none(huge_ptep_get(src_pte))) continue; @@ -4993,6 +5004,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; + unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); @@ -5013,11 +5025,14 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct end); adjust_range_if_pmd_sharing_possible(vma, &range.start, 
&range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address, sz); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { @@ -6285,6 +6300,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; + unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -6301,12 +6317,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { /* @@ -6856,6 +6875,33 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return (pte_t *)pmd; } +/* + * Return a mask that can be used to update an address to the last huge + * page in a page table page mapping size. Used to skip non-present + * page table entries when linearly scanning address ranges. Architectures + * with unique huge page to page table relationships can define their own + * version of this routine. + */ +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + if (hp_size == PUD_SIZE) + return P4D_SIZE - PUD_SIZE; + else if (hp_size == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; + else + return 0UL; +} + +#else + +/* See description above. Architectures can provide their own version. 
*/ +__weak unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + return 0UL; +} + #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* -- cgit v1.2.3 From 4ddb4d91b82f4b64458fe35bc8e395c7c082ea2b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:19 -0700 Subject: hugetlb: do not update address in huge_pmd_unshare As an optimization for loops sequentially processing hugetlb address ranges, huge_pmd_unshare would update a passed address if it unshared a pmd. Updating a loop control variable outside the loop like this is generally a bad idea. These loops are now using hugetlb_mask_last_page to optimize scanning when non-present ptes are discovered. The same can be done when huge_pmd_unshare returns 1 indicating a pmd was unshared. Remove address update from huge_pmd_unshare. Change the passed argument type and update all callers. In loops sequentially processing addresses use hugetlb_mask_last_page to update address if pmd is unshared. [sfr@canb.auug.org.au: fix an unused variable warning/error] Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Stephen Rothwell Acked-by: Muchun Song Reviewed-by: Baolin Wang Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David Hildenbrand Cc: James Houghton Cc: kernel test robot Cc: Michal Hocko Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Paul Walmsley Cc: Peter Xu Cc: Rolf Eike Beer Cc: Will Deacon Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 44 +++++++++++++++++--------------------------- mm/rmap.c | 4 ++-- 3 files changed, 21 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ce30fad5fd13..75ee739d815b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -196,7 +196,7 @@ pte_t 
*huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep); + unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, @@ -243,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write( static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 95fd1c36c17f..96635a2874e3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4935,7 +4935,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; - unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; @@ -4963,14 +4962,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(src_pte))) continue; - /* old_addr arg to huge_pmd_unshare() is a pointer and so the - * arg may be modified. Pass a copy instead to preserve the - * value in old_addr. 
- */ - old_addr_copy = old_addr; - - if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) { + if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; } @@ -5035,10 +5030,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; + address |= last_addr_mask; continue; } @@ -6327,7 +6323,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it @@ -6337,6 +6333,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pages++; spin_unlock(ptl); shared_pmd = true; + address |= last_addr_mask; continue; } pte = huge_ptep_get(ptep); @@ -6759,11 +6756,11 @@ out: * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { - pgd_t *pgd = pgd_offset(mm, *addr); - p4d_t *p4d = p4d_offset(pgd, *addr); - pud_t *pud = pud_offset(p4d, *addr); + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); BUG_ON(page_count(virt_to_page(ptep)) == 0); @@ -6773,14 +6770,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); - /* - * This update of passed address optimizes loops sequentially - * processing addresses in increments of huge 
page size (PMD_SIZE - * in this case). By clearing the pud, a PUD_SIZE area is unmapped. - * Update address to the 'last page' in the cleared area so that - * calling loop can move to first page past this area. - */ - *addr |= PUD_SIZE - PMD_SIZE; return 1; } @@ -6792,7 +6781,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } @@ -6899,6 +6888,10 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE + if (huge_page_size(h) == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; +#endif return 0UL; } @@ -7125,14 +7118,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - unsigned long tmp = address; - ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - /* We don't want 'address' to be changed */ - huge_pmd_unshare(mm, vma, &tmp, ptep); + huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); diff --git a/mm/rmap.c b/mm/rmap.c index 0532fd92ecb3..fb6b3b47f3e4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1559,7 +1559,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. 
*/ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -1920,7 +1920,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); -- cgit v1.2.3 From bf75f200569dd05ac2112797f44548beb6b4be26 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:17 +0100 Subject: mm/page_alloc: add page->buddy_list and page->pcp_list Patch series "Drain remote per-cpu directly", v5. Some setups, notably NOHZ_FULL CPUs, may be running realtime or latency-sensitive applications that cannot tolerate interference due to per-cpu drain work queued by __drain_all_pages(). Introduce a new mechanism to remotely drain the per-cpu lists. It is made possible by remotely locking 'struct per_cpu_pages' new per-cpu spinlocks. This has two advantages, the time to drain is more predictable and other unrelated tasks are not interrupted. This series has the same intent as Nicolas' series "mm/page_alloc: Remote per-cpu lists drain support" -- avoid interference of a high priority task due to a workqueue item draining per-cpu page lists. While many workloads can tolerate a brief interruption, it may cause a real-time task running on a NOHZ_FULL CPU to miss a deadline and at minimum, the draining is non-deterministic. Currently an IRQ-safe local_lock protects the page allocator per-cpu lists. 
The local_lock on its own prevents migration and the IRQ disabling protects from corruption due to an interrupt arriving while a page allocation is in progress. This series adjusts the locking. A spinlock is added to struct per_cpu_pages to protect the list contents while local_lock_irq is ultimately replaced by just the spinlock in the final patch. This allows a remote CPU to safely drain the per-cpu lists of another CPU. Follow-on work should allow the spin_lock_irqsave to be converted to spin_lock to avoid IRQs being disabled/enabled in most cases. The follow-on patch will be one kernel release later as it is relatively high risk and it'll make bisections more clear if there are any problems. Patch 1 is a cosmetic patch to clarify when page->lru is storing buddy pages and when it is storing per-cpu pages. Patch 2 shrinks per_cpu_pages to make room for a spin lock. Strictly speaking this is not necessary but it avoids per_cpu_pages consuming another cache line. Patch 3 is a preparation patch to avoid code duplication. Patch 4 is a minor correction. Patch 5 uses a spin_lock to protect the per_cpu_pages contents while still relying on local_lock to prevent migration, stabilise the pcp lookup and prevent IRQ reentrancy. Patch 6 remote drains per-cpu pages directly instead of using a workqueue. Patch 7 uses a normal spinlock instead of local_lock for remote draining. This patch (of 7): The page allocator uses page->lru for storing pages on either buddy or PCP lists. Create page->buddy_list and page->pcp_list as a union with page->lru. This is simply to clarify what type of list a page is on in the page allocator. No functional change intended.
[minchan@kernel.org: fix page lru fields in macros] Link: https://lkml.kernel.org/r/20220624125423.6126-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Reviewed-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Marcelo Tosatti Cc: Michal Hocko Cc: Hugh Dickins Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ mm/page_alloc.c | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b961a29bf26..cf97f3884fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -87,6 +87,7 @@ struct page { */ union { struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ @@ -94,6 +95,10 @@ struct page { /* Count page's or folio's mlocks */ unsigned int mlock_count; }; + + /* Or, free page */ + struct list_head buddy_list; + struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c9c02b23f02f..78ba5ba66586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -793,7 +793,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, return false; __SetPageGuard(page); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -936,7 +936,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add(&page->lru, &area->free_list[migratetype]); + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -946,7 +946,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, { struct 
free_area *area = &zone->free_area[order]; - list_add_tail(&page->lru, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -960,7 +960,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -970,7 +970,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); - list_del(&page->lru); + list_del(&page->buddy_list); __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; @@ -1508,11 +1508,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; - page = list_last_entry(list, struct page, lru); + page = list_last_entry(list, struct page, pcp_list); mt = get_pcppage_migratetype(page); /* must delete to avoid corrupting pcp list */ - list_del(&page->lru); + list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; @@ -3072,7 +3072,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * for IO devices that can merge IO requests if the physical * pages are ordered properly. 
*/ - list_add_tail(&page->lru, list); + list_add_tail(&page->pcp_list, list); allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -3322,7 +3322,7 @@ void mark_free_pages(struct zone *zone) for_each_migratetype_order(order, t) { list_for_each_entry(page, - &zone->free_area[order].free_list[t], lru) { + &zone->free_area[order].free_list[t], buddy_list) { unsigned long i; pfn = page_to_pfn(page); @@ -3411,7 +3411,7 @@ static void free_unref_page_commit(struct page *page, int migratetype, __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); - list_add(&page->lru, &pcp->lists[pindex]); + list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; /* @@ -3674,8 +3674,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, return NULL; } - page = list_first_entry(list, struct page, lru); - list_del(&page->lru); + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); pcp->count -= 1 << order; } while (check_new_pcp(page, order)); -- cgit v1.2.3 From 5d0a661d808fc8ddc26940b1a12b82ae356f3ae2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:18 +0100 Subject: mm/page_alloc: use only one PCP list for THP-sized allocations The per_cpu_pages is cache-aligned on a standard x86-64 distribution configuration but a later patch will add a new field which would push the structure into the next cache line. Use only one list to store THP-sized pages on the per-cpu list. This assumes that the vast majority of THP-sized allocations are GFP_MOVABLE but even if it was another type, it would not contribute to serious fragmentation that potentially causes a later THP allocation failure. Align per_cpu_pages on the cacheline boundary to ensure there is no false cache sharing. 
After this patch, the structure sizing is; struct per_cpu_pages { int count; /* 0 4 */ int high; /* 4 4 */ int batch; /* 8 4 */ short int free_factor; /* 12 2 */ short int expire; /* 14 2 */ struct list_head lists[13]; /* 16 208 */ /* size: 256, cachelines: 4, members: 6 */ /* padding: 32 */ } __attribute__((__aligned__(64))); Link: https://lkml.kernel.org/r/20220624125423.6126-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Nicolas Saenz Julienne Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 +++++++---- mm/page_alloc.c | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5da1135e6755..041136b5628a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -355,15 +355,18 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional - * for pageblock size for THP if configured. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list + * for THP which will usually be GFP_MOVABLE. Even if it is another type, + * it should not contribute to serious fragmentation causing THP allocation + * failures. 
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif -#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) +#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) +#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Shift to encode migratetype and order in the same integer, with order @@ -389,7 +392,7 @@ struct per_cpu_pages { /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; -}; +} ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 78ba5ba66586..b5c340d2cb43 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -653,7 +653,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - base = PAGE_ALLOC_COSTLY_ORDER + 1; + return NR_LOWORDER_PCP_LISTS; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -667,7 +667,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (pindex == NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -- cgit v1.2.3 From 4b23a68f953628eb4e4b7fe1294ebf93d4b8ceee Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:21 +0100 Subject: mm/page_alloc: protect PCP lists with a spinlock Currently the PCP lists are protected by using local_lock_irqsave to prevent migration and IRQ reentrancy but this is inconvenient. Remote draining of the lists is impossible and a workqueue is required and every task allocation/free must disable then enable interrupts which is expensive. As preparation for dealing with both of those problems, protect the lists with a spinlock. 
The IRQ-unsafe version of the lock is used because IRQs are already disabled by local_lock_irqsave. spin_trylock is used in combination with local_lock_irqsave() but later will be replaced with a spin_trylock_irqsave when the local_lock is removed. The per_cpu_pages still fits within the same number of cache lines after this patch relative to before the series. struct per_cpu_pages { spinlock_t lock; /* 0 4 */ int count; /* 4 4 */ int high; /* 8 4 */ int batch; /* 12 4 */ short int free_factor; /* 16 2 */ short int expire; /* 18 2 */ /* XXX 4 bytes hole, try to pack */ struct list_head lists[13]; /* 24 208 */ /* size: 256, cachelines: 4, members: 7 */ /* sum members: 228, holes: 1, sum holes: 4 */ /* padding: 24 */ } __attribute__((__aligned__(64))); There is overhead in the fast path due to acquiring the spinlock even though the spinlock is per-cpu and uncontended in the common case. Page Fault Test (PFT) reported the following results on a 1-socket machine. 5.19.0-rc3 5.19.0-rc3 vanilla mm-pcpspinirq-v5r16 Hmean faults/sec-1 869275.7381 ( 0.00%) 874597.5167 * 0.61%* Hmean faults/sec-3 2370266.6681 ( 0.00%) 2379802.0362 * 0.40%* Hmean faults/sec-5 2701099.7019 ( 0.00%) 2664889.7003 * -1.34%* Hmean faults/sec-7 3517170.9157 ( 0.00%) 3491122.8242 * -0.74%* Hmean faults/sec-8 3965729.6187 ( 0.00%) 3939727.0243 * -0.66%* There is a small hit in the number of faults per second but given that the results are more stable, it's borderline noise.
[akpm@linux-foundation.org: add missing local_unlock_irqrestore() on contention path] Link: https://lkml.kernel.org/r/20220624125423.6126-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Yu Zhao Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 119 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 99 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 041136b5628a..578247a341b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -382,6 +382,7 @@ enum zone_watermarks { /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { + spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 026c9437456c..a08ec4ac7ef2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -133,6 +133,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = { .lock = INIT_LOCAL_LOCK(lock), }; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. 
*/ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -3101,15 +3115,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - local_unlock_irqrestore(&pagesets.lock, flags); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif @@ -3122,16 +3143,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp, 0); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* @@ -3399,17 +3421,15 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, int migratetype, +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, unsigned int order) { - 
struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; bool free_high; __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3436,6 +3456,9 @@ static void free_unref_page_commit(struct page *page, int migratetype, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; @@ -3459,7 +3482,16 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, migratetype, order); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = this_cpu_ptr(zone->per_cpu_pageset); + if (spin_trylock(&pcp->lock)) { + free_unref_page_commit(zone, pcp, page, migratetype, order); + spin_unlock(&pcp->lock); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3469,6 +3501,8 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; unsigned long flags; int batch_count = 0; int migratetype; @@ -3495,6 +3529,17 @@ void free_unref_page_list(struct list_head *list) local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + spin_unlock(&pcp->lock); + locked_zone = zone; + pcp = this_cpu_ptr(zone->per_cpu_pageset); + spin_lock(&pcp->lock); + } + /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. 
@@ -3504,18 +3549,24 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, migratetype, 0); + free_unref_page_commit(zone, pcp, page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; local_lock_irqsave(&pagesets.lock, flags); + pcp = this_cpu_ptr(locked_zone->per_cpu_pageset); + spin_lock(&pcp->lock); } } + + if (pcp) + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3729,18 +3780,32 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp_trylock_prepare(UP_flags); + if (!spin_trylock(&pcp->lock)) { + pcp_trylock_finish(UP_flags); + local_unlock_irqrestore(&pagesets.lock, flags); + return NULL; + } + /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. 
*/ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3775,7 +3840,8 @@ struct page *rmqueue(struct zone *preferred_zone, migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype, alloc_flags); - goto out; + if (likely(page)) + goto out; } } @@ -5260,6 +5326,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5340,11 +5407,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Attempt the batch allocation */ + /* Is a parallel drain in progress? 
*/ local_lock_irqsave(&pagesets.lock, flags); + pcp_trylock_prepare(UP_flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; + if (!spin_trylock(&pcp->lock)) + goto failed_irq; + /* Attempt the batch allocation */ + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5357,8 +5428,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp, pcp_list); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) + if (!nr_account) { + spin_unlock(&pcp->lock); goto failed_irq; + } break; } nr_account++; @@ -5371,6 +5444,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); @@ -5380,6 +5455,7 @@ out: return nr_populated; failed_irq: + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); failed: @@ -7020,6 +7096,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]); -- cgit v1.2.3 From 840532711d7299d7e937952482ec899d4622c452 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:35 +0530 Subject: mm/mmap: build protect protection_map[] with __P000 Patch series "mm/mmap: Drop __SXXX/__PXXX macros from across platforms", v7. __SXXX/__PXXX macros are unnecessary abstraction layer in creating the generic protection_map[] array which is used for vm_get_page_prot(). 
This abstraction layer can be avoided if the platforms just define the protection_map[] array for all possible vm_flags access permission combinations and also export their vm_get_page_prot() implementation. This series drops the __SXXX/__PXXX macros from all platforms in the tree. First, it build-protects the generic protection_map[] array with '#ifdef __P000' and moves it inside the platforms which enable ARCH_HAS_VM_GET_PAGE_PROT. Later, it build-protects the same array with '#ifdef ARCH_HAS_VM_GET_PAGE_PROT' and moves it inside the remaining platforms while enabling ARCH_HAS_VM_GET_PAGE_PROT on them. The series also adds a new macro, DECLARE_VM_GET_PAGE_PROT, defining the current generic vm_get_page_prot(), so that it can be reused on platforms that do not require a custom implementation. Finally, ARCH_HAS_VM_GET_PAGE_PROT can just be dropped, as all platforms now define and export vm_get_page_prot() by looking up a private and static protection_map[] array. The protection_map[] data type has been changed to 'static const' on all platforms that do not change it during boot. This patch (of 26): Build-protect the generic protection_map[] array with '#ifdef __P000', so that it can be moved inside all the platforms one after the other. Otherwise there would be build failures during this process. CONFIG_ARCH_HAS_VM_GET_PAGE_PROT cannot be used for this purpose as only certain platforms enable this config now. Link: https://lkml.kernel.org/r/20220711070600.2378316-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20220711070600.2378316-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Suggested-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J.
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/mmap.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ebfc206e2b..1a435ce146a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,9 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ +#ifdef __P000 extern pgprot_t protection_map[16]; +#endif /* * The default fault flags that should be used by most of the diff --git a/mm/mmap.c b/mm/mmap.c index c14d7286a379..def0e03cf25c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -101,6 +101,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ +#ifdef __P000 pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, [VM_READ] = __P001, @@ -119,6 +120,7 @@ pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; +#endif #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT pgprot_t vm_get_page_prot(unsigned long vm_flags) -- cgit v1.2.3 From 43957b5d11037a651d162f65c682ec3c76777fc8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:36 +0530 Subject: mm/mmap: define DECLARE_VM_GET_PAGE_PROT This just converts the generic vm_get_page_prot() implementation into a new macro i.e DECLARE_VM_GET_PAGE_PROT which later can be used across platforms when enabling them with ARCH_HAS_VM_GET_PAGE_PROT. This does not create any functional change. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Suggested-by: Christoph Hellwig Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++ mm/mmap.c | 26 +------------------------- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3cdc16cfd867..014ee8f0fbaa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1689,4 +1689,32 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. 
The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes + */ +#define DECLARE_VM_GET_PAGE_PROT \ +pgprot_t vm_get_page_prot(unsigned long vm_flags) \ +{ \ + return protection_map[vm_flags & \ + (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ +} \ +EXPORT_SYMBOL(vm_get_page_prot); + #endif /* _LINUX_PGTABLE_H */ diff --git a/mm/mmap.c b/mm/mmap.c index def0e03cf25c..3c0d65743bc4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,26 +81,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* description of effects of mapping type and prot in current implementation. - * this is due to the limited x86 page protection hardware. 
The expected - * behavior is in parens: - * - * map_type prot - * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC - * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (yes) yes w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (copy) copy w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE (with Enhanced PAN supported): - * r: (no) no - * w: (no) no - * x: (yes) yes - */ #ifdef __P000 pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, @@ -123,11 +103,7 @@ pgprot_t protection_map[16] __ro_after_init = { #endif #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -pgprot_t vm_get_page_prot(unsigned long vm_flags) -{ - return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; -} -EXPORT_SYMBOL(vm_get_page_prot); +DECLARE_VM_GET_PAGE_PROT #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) -- cgit v1.2.3 From 09095f74130dfb2110ef2bcdd9ad0d42addaa1d5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:41 +0530 Subject: mm/mmap: build protect protection_map[] with ARCH_HAS_VM_GET_PAGE_PROT Now that protection_map[] has been moved inside those platforms that enable ARCH_HAS_VM_GET_PAGE_PROT. Hence generic protection_map[] array now can be protected with CONFIG_ARCH_HAS_VM_GET_PAGE_PROT intead of __P000. Link: https://lkml.kernel.org/r/20220711070600.2378316-8-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a435ce146a2..4b4dc93f9bc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,7 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifdef __P000 +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT extern pgprot_t protection_map[16]; #endif diff --git a/mm/mmap.c b/mm/mmap.c index 3c0d65743bc4..2a58a9cd0752 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,7 +81,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -#ifdef __P000 +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, [VM_READ] = __P001, @@ -100,9 +100,6 @@ pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; -#endif - -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT DECLARE_VM_GET_PAGE_PROT #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ -- cgit v1.2.3 From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:36:00 +0530 Subject: mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT Now all the platforms enable ARCH_HAS_VM_GET_PAGE_PROT. 
They define and export their own vm_get_page_prot(), whether custom or the standard DECLARE_VM_GET_PAGE_PROT. Hence there is no need for a default generic fallback for vm_get_page_prot(). Just drop this fallback and also the ARCH_HAS_VM_GET_PAGE_PROT mechanism. Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/um/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - include/linux/mm.h | 3 --- mm/Kconfig | 3 --- mm/mmap.c | 22 ---------------------- 25 files changed, 50 deletions(-) (limited to 'include') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index db1c8b329461..7d0d26b5b3f5 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -2,7 +2,6 @@ config ALPHA bool default y - select 
ARCH_HAS_VM_GET_PAGE_PROT select ARCH_32BIT_USTAT_F_TINODE select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8be56a5d8a9b..9e3653253ef2 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -13,7 +13,6 @@ config ARC select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select ARCH_32BIT_OFF_T select BUILDTIME_TABLE_SORT diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e153b6d4fc5b..7630ba9cb6cc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,7 +24,6 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1652a9800ebe..7030bf3f8d6f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -45,7 +45,6 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 588b8a9c68ed..21d72b078eef 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -6,7 +6,6 @@ config CSKY select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace) diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index bc4ceecd0588..54eadf265178 100644 
--- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -6,7 +6,6 @@ config HEXAGON def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_PREEMPT select DMA_GLOBAL_POOL # Other pending projects/to-do items. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0510a5737711..cb93769a9f2a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -12,7 +12,6 @@ config IA64 select ARCH_HAS_DMA_MARK_CLEAN select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ACPI diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index adf8cf6ec5d5..db2838cf8c02 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,7 +9,6 @@ config LOONGARCH select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 49aa0cf13e96..936cce42ae9a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -7,7 +7,6 @@ config M68K select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS select ARCH_MIGHT_HAVE_PC_PARPORT if ISA select ARCH_NO_PREEMPT if !COLDFIRE diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 15f91ba8a0c4..8cf429ad1c84 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -7,7 +7,6 @@ config MICROBLAZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select 
ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d0b7eb11ec81..db09d45d59ec 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -14,7 +14,6 @@ config MIPS select ARCH_HAS_STRNLEN_USER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK select ARCH_SUPPORTS_UPROBES diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index e0459dffd218..4167f1eb4cd8 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -6,7 +6,6 @@ config NIOS2 select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_DMA_SET_UNCACHED - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_SWAP select COMMON_CLK select TIMER_OF diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index fe0dfb50eb86..e814df4c483c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -10,7 +10,6 @@ config OPENRISC select ARCH_HAS_DMA_SET_UNCACHED select ARCH_HAS_DMA_CLEAR_UNCACHED select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select COMMON_CLK select OF select OF_EARLY_FLATTREE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 891d82393957..fa400055b2d5 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,7 +12,6 @@ config PARISC select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_PTE_SPECIAL select ARCH_NO_SG_CHAIN select ARCH_SUPPORTS_HUGETLBFS if PA20 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1035d172c7dd..250b8658b2d4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,7 +140,6 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select 
ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 583389d4e43a..32ffef9f6e5b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -32,7 +32,6 @@ config RISCV select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c4481377ca83..91c0b80a8bf0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -81,7 +81,6 @@ config S390 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_VDSO_DATA - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 91f3ea325388..5f220e903e5a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -12,7 +12,6 @@ config SUPERH select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HIBERNATION_POSSIBLE if MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 09f868613a4d..9c1cce74953a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,7 +13,6 @@ config 64BIT config SPARC bool default y - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 7fb43654e5b5..4ec22e156a2e 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -10,7 +10,6 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select 
ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be0b95e51df6..841e4843d0c4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,7 +94,6 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4c0d83520ff1..0b0f0172cced 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -11,7 +11,6 @@ config XTENSA select ARCH_HAS_DMA_SET_UNCACHED if MMU select ARCH_HAS_STRNCPY_FROM_USER if !KASAN select ARCH_HAS_STRNLEN_USER - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/include/linux/mm.h b/include/linux/mm.h index 4b4dc93f9bc3..61e3101c44ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,9 +425,6 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -extern pgprot_t protection_map[16]; -#endif /* * The default fault flags that should be used by most of the diff --git a/mm/Kconfig b/mm/Kconfig index c1fa4993a56f..56ca0e7c6f9a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -951,9 +951,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. 
-config ARCH_HAS_VM_GET_PAGE_PROT - bool - config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/mmap.c b/mm/mmap.c index 2a58a9cd0752..edf27a2789a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,28 +81,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -pgprot_t protection_map[16] __ro_after_init = { - [VM_NONE] = __P000, - [VM_READ] = __P001, - [VM_WRITE] = __P010, - [VM_WRITE | VM_READ] = __P011, - [VM_EXEC] = __P100, - [VM_EXEC | VM_READ] = __P101, - [VM_EXEC | VM_WRITE] = __P110, - [VM_EXEC | VM_WRITE | VM_READ] = __P111, - [VM_SHARED] = __S000, - [VM_SHARED | VM_READ] = __S001, - [VM_SHARED | VM_WRITE] = __S010, - [VM_SHARED | VM_WRITE | VM_READ] = __S011, - [VM_SHARED | VM_EXEC] = __S100, - [VM_SHARED | VM_EXEC | VM_READ] = __S101, - [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, - [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 -}; -DECLARE_VM_GET_PAGE_PROT -#endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ - static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); -- cgit v1.2.3 From 3ce4fee4401206cf5a2c476ec0ee6c90191dfade Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:55 +0800 Subject: mm/huge_memory: check pmd_present first in is_huge_zero_pmd When pmd is non-present, pmd_pfn returns an insane value. So we should check pmd_present first to avoid acquiring such insane value and also avoid touching possible cold huge_zero_pfn cache line when pmd isn't present. 
Link: https://lkml.kernel.org/r/20220704132201.14611-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ae3d8e2fd9e2..12b297f9951d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -273,7 +273,7 @@ static inline bool is_huge_zero_page(struct page *page) static inline bool is_huge_zero_pmd(pmd_t pmd) { - return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); + return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } static inline bool is_huge_zero_pud(pud_t pud) -- cgit v1.2.3 From 121c1781aeb00475d163246d9ae7d8746e377040 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:58 +0800 Subject: mm/huge_memory: fix comment of page_deferred_list The current comment is confusing because if global or memcg deferred list in the second tail page is occupied by compound_head, why we still use page[2].deferred_list here? I think it wants to say that Global or memcg deferred list in the first tail page is occupied by compound_mapcount and compound_pincount so we use the second tail page's deferred_list instead. 
Link: https://lkml.kernel.org/r/20220704132201.14611-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 12b297f9951d..37f2f11a6d7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -294,8 +294,8 @@ static inline bool thp_migration_supported(void) static inline struct list_head *page_deferred_list(struct page *page) { /* - * Global or memcg deferred list in the second tail pages is - * occupied by compound_head. + * See organization of tail pages of compound page in + * "struct page" definition. */ return &page[2].deferred_list; } -- cgit v1.2.3 From dcadcf1c30619ead2f3280bfb7f74de8304be2bb Mon Sep 17 00:00:00 2001 From: Gang Li Date: Wed, 6 Jul 2022 11:46:54 +0800 Subject: mm, hugetlb: skip irrelevant nodes in show_free_areas() show_free_areas() allows to filter out node specific data which is irrelevant to the allocation request. But hugetlb_show_meminfo() still shows hugetlb on all nodes, which is redundant and unnecessary. Use show_mem_node_skip() to skip irrelevant nodes. And replace hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid). 
before-and-after sample output of OOM: before: ``` [ 214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB [ 214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 214.388334] lowmem_reserve[]: 0 0 0 0 0 [ 214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20 [ 214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB [ 214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` after: ``` [ 145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u [ 145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 145.152315] lowmem_reserve[]: 0 0 0 0 0 [ 145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME) [ 145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 18 ++++++++---------- mm/page_alloc.c | 8 ++++++-- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 75ee739d815b..4cdfce976644 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); -void hugetlb_show_meminfo(void); +void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long 
address, unsigned int flags); @@ -298,7 +298,7 @@ static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) return 0; } -static inline void hugetlb_show_meminfo(void) +static inline void hugetlb_show_meminfo_node(int nid) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96635a2874e3..bb763f5d30b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4477,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid) nid, h->surplus_huge_pages_node[nid]); } -void hugetlb_show_meminfo(void) +void hugetlb_show_meminfo_node(int nid) { struct hstate *h; - int nid; if (!hugepages_supported()) return; - for_each_node_state(nid, N_MEMORY) - for_each_hstate(h) - pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", - nid, - h->nr_huge_pages_node[nid], - h->free_huge_pages_node[nid], - h->surplus_huge_pages_node[nid], - huge_page_size(h) / SZ_1K); + for_each_hstate(h) + printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 215b26664ad7..4fa96d3510fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6022,7 +6022,7 @@ static void show_migration_types(unsigned char type) void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; - int cpu; + int cpu, nid; struct zone *zone; pg_data_t *pgdat; @@ -6210,7 +6210,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "= %lukB\n", K(total)); } - hugetlb_show_meminfo(); + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); -- cgit v1.2.3 From 
0d8bc0b10aeab543bdccb86180f58db1f79f7cee Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 13 Jul 2022 20:53:14 +0800 Subject: writeback: cleanup bdi_sched_wait() bdi_sched_wait() is no longer used since commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), so remove it. Link: https://lkml.kernel.org/r/20220713125314.171345-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Acked-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d452071db572..e84b745a6811 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -140,12 +140,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } -static inline int bdi_sched_wait(void *word) -{ - schedule(); - return 0; -} - #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, -- cgit v1.2.3 From fef3e9066d19230f661048ca86937d954c12cd50 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 14 Jul 2022 16:41:47 +0800 Subject: writeback: remove inode_to_wb_is_valid() inode_to_wb_is_valid() is no longer used since commit fe55d563d417 ("remove inode_congested()"), remove it. 
Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e84b745a6811..439815cc1ab9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_to_wb_is_valid - test whether an inode has a wb associated - * @inode: inode of interest - * - * Returns %true if @inode has a wb associated. May be called without any - * locking. - */ -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return inode->i_wb; -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return true; -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; -- cgit v1.2.3 From 73b73bac90d97400e29e585c678c4d0ebfd2680d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 14 Jul 2022 06:49:18 +0000 Subject: mm: vmpressure: don't count proactive reclaim in vmpressure memory.reclaim is a cgroup v2 interface that allows users to proactively reclaim memory from a memcg, without real memory pressure. Reclaim operations invoke vmpressure, which is used: (a) To notify userspace of reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being under memory pressure for networking (see mem_cgroup_under_socket_pressure()). For (a), vmpressure notifications in v1 are not affected by this change since memory.reclaim is a v2 feature. 
For (b), the effects of the vmpressure signal (according to Shakeel [1]) are as follows: 1. Reducing send and receive buffers of the current socket. 2. May drop packets on the rx path. 3. May throttle current thread on the tx path. Since proactive reclaim is invoked directly by userspace, not by memory pressure, it makes sense not to throttle networking. Hence, this change makes sure that proactive reclaim caused by memory.reclaim does not trigger vmpressure. [1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/ [yosryahmed@google.com: update documentation] Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Cc: Muchun Song Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Miaohe Lin Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 +++++++ include/linux/swap.h | 5 ++++- mm/memcontrol.c | 24 ++++++++++++++---------- mm/vmscan.c | 27 +++++++++++++++++---------- 4 files changed, 42 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index ad9ba3ec90a5..376d0207d1f7 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1229,6 +1229,13 @@ PAGE_SIZE multiple when read back. the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. + Please note that the proactive reclaim (triggered by this + interface) is not meant to indicate memory pressure on the + memory cgroup. Therefore socket memory balancing triggered by + the memory reclaim normally is not exercised in this case. 
+ This means that the networking layer will not adapt based on + reclaim induced by memory.reclaim. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index 6d11c51b2b62..ea895b40e6ff 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 767f49a6b987..2b831cc48c7d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2330,7 +2330,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, - gfp_mask, true); + gfp_mask, + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2575,7 +2576,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - bool may_swap = true; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; bool drained = false; bool raised_max_event = false; unsigned long pflags; @@ -2593,7 +2594,7 @@ retry: mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { mem_over_limit = mem_cgroup_from_counter(counter, memsw); - may_swap = false; + reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; } if (batch > 
nr_pages) { @@ -2621,7 +2622,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, may_swap); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3439,8 +3440,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, continue; } - if (!try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, !memsw)) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3550,7 +3551,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6302,7 +6304,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6351,7 +6353,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, true)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6480,6 +6482,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + unsigned int reclaim_options; int err; buf = strstrip(buf); @@ -6487,6 +6490,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (err) return err; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6503,7 +6507,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, true); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index fbb4108250ee..9e7d8db42918 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at @@ -3180,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->priority); /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } @@ -3305,9 +3309,10 @@ again: } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -3589,8 +3594,9 @@ retry: __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, - sc->priority); + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); @@ -3880,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap) + 
unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3893,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = may_swap, + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put -- cgit v1.2.3 From e408e695f5f1f60d784913afc45ff2c387a5aeb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Jul 2022 21:59:12 -0400 Subject: mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL, FS_NODUMP_FL, etc., like all other standard Linux file systems. [akpm@linux-foundation.org: fix CONFIG_TMPFS_XATTR=n warnings] Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu Signed-off-by: Theodore Ts'o Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 11 +++++++++ mm/shmem.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a68f982f22d1..1b6c4013f691 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,9 +25,20 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ + unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; }; +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE +#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ diff --git a/mm/shmem.c b/mm/shmem.c index 12ac67dc831f..06871a913b49 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1058,6 +1059,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); if (shmem_is_huge(NULL, inode, 0)) @@ -2272,7 +2282,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +/* Mask out flags that are inappropriate for the given type of inode. */ +static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & SHMEM_REG_FLMASK; + else + return flags & SHMEM_OTHER_FLMASK; +} + +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; @@ -2297,6 +2318,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + info->fsflags = shmem_mask_flags(mode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -3138,6 +3162,40 @@ static const char *shmem_get_link(struct dentry *dentry, } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); + if (info->fsflags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (info->fsflags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + + inode->i_ctime = current_time(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". 
As soon as we have any other xattrs @@ -3828,6 +3886,8 @@ static const struct inode_operations shmem_inode_operations = { #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; @@ -3847,6 +3907,8 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, -- cgit v1.2.3 From bb077c3ffd5362a6d9e60574e1bcc83fe8e3fb27 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 26 Jul 2022 21:18:16 +0800 Subject: mm: cleanup is_highmem() The CONFIG_HIGHMEM check in is_highmem() is unnecessary because is_highmem_idx() already performs it, so drop it and move is_highmem() next to is_highmem_idx(). This has no functional impact. Link: https://lkml.kernel.org/r/20220726131816.149075-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578247a341b2..e24b40c52468 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else -static inline bool has_managed_dma(void) -{ - return false; -} -#endif - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not.
This is an attempt to keep references @@ -1155,12 +1146,17 @@ static inline bool has_managed_dma(void) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); +} + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); #else - return 0; -#endif +static inline bool has_managed_dma(void) +{ + return false; } +#endif /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -- cgit v1.2.3