From 3ddc2fefe6f34ec870fcb22f4cd656e53db079f2 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 22 Jul 2024 18:29:23 +0200 Subject: mm: vmalloc: implement vrealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Align kvrealloc() with krealloc()", v2. Besides the obvious (and desired) difference between krealloc() and kvrealloc(), there is some inconsistency in their function signatures and behavior: - krealloc() frees the memory when the requested size is zero, whereas kvrealloc() simply returns a pointer to the existing allocation. - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas kvrealloc() does not accept a NULL pointer at all and, if passed, would fault instead. - krealloc() is self-contained, whereas kvrealloc() relies on the caller to provide the size of the previous allocation. Inconsistent behavior throughout allocation APIs is error prone, hence make kvrealloc() behave like krealloc(), which seems superior in all mentioned aspects. In order to be able to get rid of kvrealloc()'s oldsize parameter, introduce vrealloc() and make use of it in kvrealloc(). Making use of vrealloc() in kvrealloc() also provides oppertunities to grow (and shrink) allocations more efficiently. For instance, vrealloc() can be optimized to allocate and map additional pages to grow the allocation or unmap and free unused pages to shrink the allocation. Besides the above, those functions are required by Rust's allocator abstractons [1] (rework based on this series in [2]). With `Vec` or `KVec` respectively, potentially growing (and shrinking) data structures are rather common. [1] https://lore.kernel.org/lkml/20240704170738.3621-1-dakr@redhat.com/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/dakr/linux.git/log/?h=rust/mm This patch (of 2): Implement vrealloc() analogous to krealloc(). Currently, krealloc() requires the caller to pass the size of the previous memory allocation, which, instead, should be self-contained. We attempt to fix this in a subsequent patch which, in order to do so, requires vrealloc(). Besides that, we need realloc() functions for kernel allocators in Rust too. With `Vec` or `KVec` respectively, potentially growing (and shrinking) data structures are rather common. [dakr@kernel.org: fix missing nommu implementation] Link: https://lkml.kernel.org/r/20240725141227.13954-1-dakr@kernel.org [dakr@kernel.org: document concurrency restrictions] Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org [dakr@kernel.org: consider spare memory for __GFP_ZERO] Link: https://lkml.kernel.org/r/20240730185049.6244-3-dakr@kernel.org [dakr@kernel.org: properly document __GFP_ZERO behavior] Link: https://lkml.kernel.org/r/20240730185049.6244-4-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-1-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-2-dakr@kernel.org Signed-off-by: Danilo Krummrich Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Chandan Babu R Cc: Christian König Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Marc Zyngier Cc: Michael Ellerman Cc: Miguel Ojeda Cc: Oliver Upton Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- mm/vmalloc.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af2de36549d6..884e21fed28f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4030,6 +4030,76 @@ void *vzalloc_node_noprof(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node_noprof); +/** + * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and + * @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or vfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory; %NULL if @size is zero or in case of + * failure + */ +void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + size_t old_size = 0; + void *n; + + if (!size) { + vfree(p); + return NULL; + } + + if (p) { + struct vm_struct *vm; + + vm = find_vm_area(p); + if (unlikely(!vm)) { + WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); + return NULL; + } + + old_size = get_vm_area_size(vm); + } + + /* + * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What + * would be a good heuristic for when to shrink the vm_area? + */ + if (size <= old_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) + memset((void *)p + size, 0, old_size - size); + + return (void *)p; + } + + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ + n = __vmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + memcpy(n, p, old_size); + vfree(p); + } + + return n; +} + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -- cgit v1.2.3 From 6963f00813f49375360544fe923e62f2070601af Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Wed, 14 Aug 2024 01:12:13 +0800 Subject: mm: vmalloc: add optimization hint on page existence check In commit 21e516b913c1 ("mm: vmalloc: dump page owner info if page is already mapped"), a BUG_ON macro was changed into an if statement, where the compiler optimization hint introduced in the BUG_ON macro was removed along with this change. This patch adds back the hint. Link: https://lkml.kernel.org/r/20240814-fix_vmap_unlikely-v1-1-cd7954775f12@gmail.com Fixes: 21e516b913c1 ("mm: vmalloc: dump page owner info if page is already mapped") Signed-off-by: Miao Wang Cc: Christoph Hellwig Cc: Hariom Panthi Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 884e21fed28f..dfa8f09e7e9d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - if (!pte_none(ptep_get(pte))) { + if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { page = pfn_to_page(pfn); dump_page(page, "remapping already mapped page"); -- cgit v1.2.3 From 7de8728f55ffd6497d4ef590218fad8da7f1af92 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 27 Aug 2024 21:09:16 +0200 Subject: mm: vmalloc: refactor vm_area_alloc_pages() function The aim is to simplify and making the vm_area_alloc_pages() function less confusing as it became more clogged nowadays: - eliminate a "bulk_gfp" variable and do not overwrite a gfp flag for bulk allocator; - drop __GFP_NOFAIL flag for high-order-page requests on upper layer. It becomes less spread between levels when it comes to __GFP_NOFAIL allocations; - add a comment about a fallback path if high-order attempt is unsuccessful because for such cases __GFP_NOFAIL is dropped; - fix a typo in a commit message. Link: https://lkml.kernel.org/r/20240827190916.34242-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dfa8f09e7e9d..264a2627bea4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3515,8 +3515,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; - gfp_t alloc_gfp = gfp; - bool nofail = gfp & __GFP_NOFAIL; struct page *page; int i; @@ -3527,9 +3525,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { - /* bulk allocator doesn't support nofail req. officially */ - gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; - while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -3547,12 +3542,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); - else - nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3566,30 +3560,24 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else if (gfp & __GFP_NOFAIL) { - /* - * Higher order nofail allocations are really expensive and - * potentially dangerous (pre-mature OOM, disruptive reclaim - * and compaction etc. - */ - alloc_gfp &= ~__GFP_NOFAIL; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { - if (!nofail && fatal_signal_pending(current)) + if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages_noprof(alloc_gfp, order); + page = alloc_pages_noprof(gfp, order); else - page = alloc_pages_node_noprof(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, gfp, order); + if (unlikely(!page)) break; /* * Higher order allocations must be able to be treated as - * indepdenent small pages by callers (as they can with + * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. @@ -3650,7 +3638,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + /* + * Higher order nofail allocations are really expensive and + * potentially dangerous (pre-mature OOM, disruptive reclaim + * and compaction etc. + * + * Please note, the __vmalloc_node_range_noprof() falls-back + * to order-0 pages if high-order attempt is unsuccessful. + */ + area->nr_pages = vm_area_alloc_pages((page_order ? + gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); -- cgit v1.2.3 From 7ae12a57c56e0b87e6e698479c1f8b65434a608f Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 28 Aug 2024 12:12:16 +0800 Subject: mm/vmalloc.c: make use of the helper macro LIST_HEAD() list_head can be initialized automatically with LIST_HEAD() instead of calling INIT_LIST_HEAD(). Here we can simplify the code. Link: https://lkml.kernel.org/r/20240828041216.1222582-1-lihongbo22@huawei.com Signed-off-by: Hongbo Li Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 264a2627bea4..efba551d81aa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2131,23 +2131,18 @@ reclaim_list_global(struct list_head *head) static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { + LIST_HEAD(decay_list); + struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - struct list_head decay_list; - struct rb_root decay_root; unsigned long n_decay; int i; - decay_root = RB_ROOT; - INIT_LIST_HEAD(&decay_list); - for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { - struct list_head tmp_list; + LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; - INIT_LIST_HEAD(&tmp_list); - /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); -- cgit v1.2.3 From b44f71e3fa35e1d8076b73316abff155ef1aa26c Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 6 Sep 2024 18:25:39 +0800 Subject: mm/vmalloc.c: use helper function va_size() Use helper function va_size() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20240906102539.3537207-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index efba551d81aa..cf2af80157bc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -2018,7 +2018,7 @@ retry: if (vm) { vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); va->vm = vm; } @@ -2192,7 +2192,7 @@ static void purge_vmap_node(struct work_struct *work) vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { - unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; unsigned int vn_id = decode_vn_id(va->flags); @@ -2336,8 +2336,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> - PAGE_SHIFT, &vmap_lazy_nr); + nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + &vmap_lazy_nr); /* * If it was request by a certain node we would like to @@ -2933,8 +2933,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (WARN_ON_ONCE(!va)) return; - debug_check_no_locks_freed((void *)va->va_start, - (va->va_end - va->va_start)); + debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -4932,7 +4931,7 @@ static void show_purge_info(struct seq_file *m) list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); } spin_unlock(&vn->lazy.lock); } @@ -4954,7 +4953,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); continue; } -- cgit v1.2.3 From 6004fe001d6c88ad8c51e1779989e69694cc9ced Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 6 Sep 2024 11:50:49 +0200 Subject: mm/vmalloc.c: use "high-order" in description non 0-order pages In many places, in the comments, we use both "higher-order" and "high-order" to describe the non 0-order pages. That is confusing, because a "higher-order" statement does not reflect what it is compared with. Link: https://lkml.kernel.org/r/20240906095049.3486-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Suggested-by: Baoquan He Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Michal Hocko Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cf2af80157bc..166e12c90ff6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3570,7 +3570,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, break; /* - * Higher order allocations must be able to be treated as + * High-order allocations must be able to be treated as * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, @@ -3633,7 +3633,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page_order = vm_area_page_order(area); /* - * Higher order nofail allocations are really expensive and + * High-order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. * -- cgit v1.2.3