From 188f87f2648b13f5de17d5e068f18d317e0c1f98 Mon Sep 17 00:00:00 2001
From: Eric Chanudet
Date: Wed, 22 May 2024 16:38:01 -0400
Subject: mm/mm_init: use node's number of cpus in
 deferred_page_init_max_threads

x86_64 already uses the node's number of CPUs as the maximum number of
threads. Make that the default for all architectures that set
DEFERRED_STRUCT_PAGE_INIT. This returns to the behavior that existed
before commit ecd096506922 ("mm: make deferred init's max threads
arch-specific") made the function arch-specific.

With DEFERRED_STRUCT_PAGE_INIT set, testing on a few arm64 platforms
shows faster deferred_init_memmap completions:

|         | x13s        | SA8775p-ride | Ampere R137-P31 | Ampere HR330 |
|         | Metal, 32GB | VM, 36GB     | VM, 58GB        | Metal, 128GB |
|         | 8cpus       | 8cpus        | 8cpus           | 32cpus       |
|---------|-------------|--------------|-----------------|--------------|
| threads | ms (%)      | ms (%)       | ms (%)          | ms (%)       |
|---------|-------------|--------------|-----------------|--------------|
| 1       | 108 (0%)    | 72 (0%)      | 224 (0%)        | 324 (0%)     |
| cpus    | 24 (-77%)   | 36 (-50%)    | 40 (-82%)       | 56 (-82%)    |

Michael Ellerman reported:

: On a machine here (1TB, 40 cores, 4KB pages) the existing code gives:
:
: [ 0.500124] node 2 deferred pages initialised in 210ms
: [ 0.515790] node 3 deferred pages initialised in 230ms
: [ 0.516061] node 0 deferred pages initialised in 230ms
: [ 0.516522] node 7 deferred pages initialised in 230ms
: [ 0.516672] node 4 deferred pages initialised in 230ms
: [ 0.516798] node 6 deferred pages initialised in 230ms
: [ 0.517051] node 5 deferred pages initialised in 230ms
: [ 0.523887] node 1 deferred pages initialised in 240ms
:
: vs with the patch:
:
: [ 0.379613] node 0 deferred pages initialised in 90ms
: [ 0.380388] node 1 deferred pages initialised in 90ms
: [ 0.380540] node 4 deferred pages initialised in 100ms
: [ 0.390239] node 6 deferred pages initialised in 100ms
: [ 0.390249] node 2 deferred pages initialised in 100ms
: [ 0.390786] node 3 deferred pages initialised in 110ms
: [ 0.396721] node 5 deferred pages initialised in 110ms
: [ 0.397095] node 7 deferred pages initialised in 110ms
:
: Which is a nice speedup.

[echanude@redhat.com: v3]
  Link: https://lkml.kernel.org/r/20240528185455.643227-4-echanude@redhat.com
Link: https://lkml.kernel.org/r/20240522203758.626932-4-echanude@redhat.com
Signed-off-by: Eric Chanudet
Tested-by: Michael Ellerman (powerpc)
Reviewed-by: Baoquan He
Acked-by: Alexander Gordeev
Acked-by: Mike Rapoport (IBM)
Cc: Andy Lutomirski
Cc: Borislav Petkov (AMD)
Cc: Dave Hansen
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Nicholas Piggin
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
---
 mm/mm_init.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3ec04933f7fd..779b4e3d6339 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2122,11 +2122,10 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 	}
 }
 
-/* An arch may override for more concurrency. */
-__weak int __init
+static unsigned int __init
 deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 {
-	return 1;
+	return max(cpumask_weight(node_cpumask), 1U);
 }
 
 /* Initialise remaining memory on a node */
-- cgit v1.2.3
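
The value returned here feeds the per-node padata job that fans deferred
memmap initialization out over multiple workers. A sketch of the call site
in deferred_init_memmap(), abridged from mm/mm_init.c of this era (the loop
and surrounding setup are elided, so treat this as illustrative rather than
the exact upstream code):

/* Abridged sketch: one padata job per node, at most
 * deferred_page_init_max_threads() worker threads. */
static int __init deferred_init_memmap(void *data)
{
	pg_data_t *pgdat = data;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
	unsigned int max_threads = deferred_page_init_max_threads(cpumask);
	/* ... zone/spfn/epfn setup elided ... */
	struct padata_mt_job job = {
		.thread_fn   = deferred_init_memmap_chunk,
		.fn_arg      = zone,
		.start       = spfn,
		.size        = epfn_align - spfn,
		.align       = PAGES_PER_SECTION,
		.min_chunk   = PAGES_PER_SECTION,
		.max_threads = max_threads,
	};

	padata_do_multithreaded(&job);
	/* ... */
}

With max_threads now defaulting to the node's CPU count, padata can split
each node's pfn range across all local CPUs instead of running single-threaded.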
From 4f66da89d31ca56d4c41de01dd663f79d697904b Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 11 Jun 2024 14:52:23 +0000
Subject: mm/mm_init.c: print mem_init info after defer_init is done

The current call flow looks like this:

  start_kernel
    mm_core_init
      mem_init
      mem_init_print_info
    rest_init
      kernel_init
        kernel_init_freeable
          page_alloc_init_late
            deferred_init_memmap

With CONFIG_DEFERRED_STRUCT_PAGE_INIT, at the time mem_init_print_info()
is called, pages are not yet fully initialized and freed to the buddy
allocator. This has one issue:

* nr_free_pages() only accounts for part of the free pages in the
  system, which is not what we expect.

Let's print the mem info after defer_init is done.

This would also help changing the totalram_pages accounting, since we
plan to move the accounting into __free_pages_core().

Link: https://lkml.kernel.org/r/20240611145223.16872-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Acked-by: David Hildenbrand
Cc: Mike Rapoport (IBM)
Signed-off-by: Andrew Morton
---
 mm/mm_init.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/mm_init.c')

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 779b4e3d6339..1bab27398302 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2321,6 +2321,7 @@ void set_zone_contiguous(struct zone *zone)
 	zone->contiguous = true;
 }
 
+static void __init mem_init_print_info(void);
 void __init page_alloc_init_late(void)
 {
 	struct zone *zone;
@@ -2347,6 +2348,8 @@ void __init page_alloc_init_late(void)
 	files_maxfiles_init();
 #endif
 
+	/* Accounting of total+free memory is stable at this point. */
+	mem_init_print_info();
 	buffer_init();
 
 	/* Discard memblock private memory */
@@ -2707,7 +2710,6 @@ void __init mm_core_init(void)
 	kmsan_init_shadow();
 	stack_depot_early_init();
 	mem_init();
-	mem_init_print_info();
 	kmem_cache_init();
 	/*
 	 * page_owner must be initialized after buddy is ready, and also after
-- cgit v1.2.3
From 15995a35247442aefa0ffe36a6dad51cb46b0918 Mon Sep 17 00:00:00 2001
From: Sourav Panda
Date: Wed, 5 Jun 2024 22:27:51 +0000
Subject: mm: report per-page metadata information

Today, we have no observability of per-page metadata and how much of the
machine's capacity it takes away. Thus, we want to describe the amount
of memory that goes towards per-page metadata, which can vary depending
on build configuration, machine architecture, and system use.

This patch adds 2 fields to /proc/vmstat that can be used as shown below:

Accounting per-page metadata allocated by the boot allocator:
	/proc/vmstat:nr_memmap_boot * PAGE_SIZE

Accounting per-page metadata allocated by the buddy allocator:
	/proc/vmstat:nr_memmap * PAGE_SIZE

Accounting the total per-page metadata allocated on the machine:
	(/proc/vmstat:nr_memmap_boot + /proc/vmstat:nr_memmap) * PAGE_SIZE

Utility for userspace:

Observability: Describe the amount of memory overhead that goes to
per-page metadata on the system at any given time, since this overhead
is not currently observable.

Debugging: Tracking the changes or the absolute value of struct page
memory can help detect anomalies, as it can be correlated with other
metrics on the machine (e.g., memtotal, number of huge pages, etc).

page_ext overheads: Some kernel features that use page_ext, such as
page_owner and page_table_check, can be optionally enabled via kernel
parameters. Having the total per-page metadata information helps users
precisely measure their impact. Furthermore, the page-metadata metrics
will reflect the amount of struct page memory relinquished (or overhead
reduced) when hugetlbfs pages are reserved, which will vary depending on
whether hugetlb vmemmap optimization is enabled.

For background and results see:
lore.kernel.org/all/20240220214558.3377482-1-souravpanda@google.com

Link: https://lkml.kernel.org/r/20240605222751.1406125-1-souravpanda@google.com
Signed-off-by: Sourav Panda
Acked-by: David Rientjes
Reviewed-by: Pasha Tatashin
Cc: Alexey Dobriyan
Cc: Bjorn Helgaas
Cc: Chen Linxuan
Cc: David Hildenbrand
Cc: Greg Kroah-Hartman
Cc: Ivan Babrou
Cc: Johannes Weiner
Cc: Jonathan Corbet
Cc: Kefeng Wang
Cc: Kirill A. Shutemov
Cc: Liam R. Howlett
Cc: Mike Kravetz
Cc: Mike Rapoport (IBM)
Cc: Muchun Song
Cc: "Rafael J. Wysocki"
Cc: Randy Dunlap
Cc: Shakeel Butt
Cc: Suren Baghdasaryan
Cc: Tomas Mudrunka
Cc: Vlastimil Babka
Cc: Wei Xu
Cc: Yang Yang
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 include/linux/mmzone.h |  2 ++
 include/linux/vmstat.h |  4 ++++
 mm/hugetlb_vmemmap.c   | 17 +++++++++++++----
 mm/mm_init.c           |  3 +++
 mm/page_alloc.c        |  1 +
 mm/page_ext.c          | 32 +++++++++++++++++++++++---------
 mm/sparse-vmemmap.c    |  8 ++++++++
 mm/sparse.c            |  7 ++++++-
 mm/vmstat.c            | 26 +++++++++++++++++++++++++-
 9 files changed, 85 insertions(+), 15 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 586a8f0104d7..cb7f265c2b96 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,6 +220,8 @@ enum node_stat_item {
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
+	NR_MEMMAP, /* page metadata allocated through buddy allocator */
+	NR_MEMMAP_BOOT, /* page metadata allocated through boot allocator */
 	NR_VM_NODE_STAT_ITEMS
 };

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 735eae6e272c..16b0cfa80502 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -624,4 +624,8 @@ static inline void lruvec_stat_sub_folio(struct folio *folio,
 {
 	lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
 }
+
+void __meminit mod_node_early_perpage_metadata(int nid, long delta);
+void __meminit store_early_perpage_metadata(void);
+
 #endif /* _LINUX_VMSTAT_H */

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index b9a55322e52c..fa00d61b6c5a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -184,10 +184,13 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
  */
 static inline void free_vmemmap_page(struct page *page)
 {
-	if (PageReserved(page))
+	if (PageReserved(page)) {
 		free_bootmem_page(page);
-	else
+		mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
+	} else {
 		__free_page(page);
+		mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
+	}
 }
 
 /* Free a list of the vmemmap pages */
@@ -338,6 +341,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 		copy_page(page_to_virt(walk.reuse_page),
 			  (void *)walk.reuse_addr);
 		list_add(&walk.reuse_page->lru, vmemmap_pages);
+		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
 	}
 
 	/*
@@ -384,14 +388,19 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
 	int nid = page_to_nid((struct page *)start);
 	struct page *page, *next;
+	int i;
 
-	while (nr_pages--) {
+	for (i = 0; i < nr_pages; i++) {
 		page = alloc_pages_node(nid, gfp_mask, 0);
-		if (!page)
+		if (!page) {
+			mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
 			goto out;
+		}
 		list_add(&page->lru, list);
 	}
 
+	mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);
+
 	return 0;
 out:
 	list_for_each_entry_safe(page, next, list, lru)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1bab27398302..b882ae7977ae 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"
 #include "slab.h"
 #include "shuffle.h"
@@ -1618,6 +1619,8 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 		panic("Failed to allocate %ld bytes for node %d memory map\n",
 		      size, pgdat->node_id);
 	pgdat->node_mem_map = map + offset;
+	mod_node_early_perpage_metadata(pgdat->node_id,
+					DIV_ROUND_UP(size, PAGE_SIZE));
 	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
 		 __func__, pgdat->node_id, (unsigned long)pgdat,
 		 (unsigned long)pgdat->node_mem_map);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 804c171a3c3a..0576ac081a1f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5738,6 +5738,7 @@ void __init setup_per_cpu_pageset(void)
 	for_each_online_pgdat(pgdat)
 		pgdat->per_cpu_nodestats =
 			alloc_percpu(struct per_cpu_nodestat);
+	store_early_perpage_metadata();
 }

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 95dd8ffeaf81..c191e490c401 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -214,6 +214,8 @@ static int __init alloc_node_page_ext(int nid)
 		return -ENOMEM;
 	NODE_DATA(nid)->node_page_ext = base;
 	total_usage += table_size;
+	mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT,
+			    DIV_ROUND_UP(table_size, PAGE_SIZE));
 	return 0;
 }
 
@@ -268,12 +270,15 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
 	void *addr = NULL;
 
 	addr = alloc_pages_exact_nid(nid, size, flags);
-	if (addr) {
+	if (addr)
 		kmemleak_alloc(addr, size, 1, flags);
-		return addr;
-	}
+	else
+		addr = vzalloc_node(size, nid);
 
-	addr = vzalloc_node(size, nid);
+	if (addr) {
+		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP,
+				    DIV_ROUND_UP(size, PAGE_SIZE));
+	}
 
 	return addr;
 }
@@ -316,18 +321,27 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
 
 static void free_page_ext(void *addr)
 {
+	size_t table_size;
+	struct page *page;
+	struct pglist_data *pgdat;
+
+	table_size = page_ext_size * PAGES_PER_SECTION;
+
 	if (is_vmalloc_addr(addr)) {
+		page = vmalloc_to_page(addr);
+		pgdat = page_pgdat(page);
 		vfree(addr);
 	} else {
-		struct page *page = virt_to_page(addr);
-		size_t table_size;
-
-		table_size = page_ext_size * PAGES_PER_SECTION;
-
+		page = virt_to_page(addr);
+		pgdat = page_pgdat(page);
 		BUG_ON(PageReserved(page));
 		kmemleak_free(addr);
 		free_pages_exact(addr, table_size);
 	}
+
+	mod_node_page_state(pgdat, NR_MEMMAP,
+			    -1L * (DIV_ROUND_UP(table_size, PAGE_SIZE)));
+
 }
 
 static void __free_page_ext(unsigned long pfn)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a2cbe44c48e1..1dda6c53370b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -469,5 +469,13 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 	if (r < 0)
 		return NULL;
 
+	if (system_state == SYSTEM_BOOTING) {
+		mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(end - start,
+								  PAGE_SIZE));
+	} else {
+		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP,
+				    DIV_ROUND_UP(end - start, PAGE_SIZE));
+	}
+
 	return pfn_to_page(pfn);
 }

diff --git a/mm/sparse.c b/mm/sparse.c
index 0c64db3f49a6..8724a06d8128 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -14,7 +14,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include "internal.h"
 #include 
@@ -465,6 +465,9 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
 	 */
 	sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
 	sparsemap_buf_end = sparsemap_buf + size;
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+	mod_node_early_perpage_metadata(nid, DIV_ROUND_UP(size, PAGE_SIZE));
+#endif
 }
 
 static void __init sparse_buffer_fini(void)
@@ -643,6 +646,8 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
 
+	mod_node_page_state(page_pgdat(pfn_to_page(pfn)), NR_MEMMAP,
+			    -1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
 	vmemmap_free(start, end, altmap);
 }
 
 static void free_map_bootmem(struct page *memmap)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..73d791d1caad 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1255,7 +1255,8 @@ const char * const vmstat_text[] = {
 	"pgdemote_kswapd",
 	"pgdemote_direct",
 	"pgdemote_khugepaged",
-
+	"nr_memmap",
+	"nr_memmap_boot",
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
@@ -2282,4 +2283,27 @@ static int __init extfrag_debug_init(void)
 }
 
 module_init(extfrag_debug_init);
+
 #endif
+
+/*
+ * Page metadata size (struct page and page_ext) in pages
+ */
+static unsigned long early_perpage_metadata[MAX_NUMNODES] __meminitdata;
+
+void __meminit mod_node_early_perpage_metadata(int nid, long delta)
+{
+	early_perpage_metadata[nid] += delta;
+}
+
+void __meminit store_early_perpage_metadata(void)
+{
+	int nid;
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat) {
+		nid = pgdat->node_id;
+		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP_BOOT,
+				    early_perpage_metadata[nid]);
+	}
+}
-- cgit v1.2.3
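
Since the two new counters are ordinary node_stat_item entries, their
system-wide sums show up as lines in /proc/vmstat, in pages. A small
self-contained userspace sketch of the accounting formulas from the commit
message (counter names come from the patch; error handling kept minimal):

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Sum per-page metadata from /proc/vmstat. Counters are in pages,
 * so multiply by the runtime page size to get bytes. */
int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	long nr_memmap = 0, nr_memmap_boot = 0;
	char name[64];
	long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %ld", name, &val) == 2) {
		if (!strcmp(name, "nr_memmap"))
			nr_memmap = val;		/* buddy allocator */
		else if (!strcmp(name, "nr_memmap_boot"))
			nr_memmap_boot = val;		/* boot allocator */
	}
	fclose(f);
	printf("buddy: %ld bytes\n", nr_memmap * page_size);
	printf("boot:  %ld bytes\n", nr_memmap_boot * page_size);
	printf("total: %ld bytes\n", (nr_memmap + nr_memmap_boot) * page_size);
	return 0;
}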
From 972b89c1f05f49bac53ea977d20b49330b81b0fa Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Wed, 12 Jun 2024 02:04:21 +0000
Subject: mm/mm_init.c: simplify logic of deferred_[init|free]_pages

The functions deferred_[init|free]_pages are only used in
deferred_init_maxorder(), which makes sure the range to init/free is
within MAX_ORDER_NR_PAGES in size. With this knowledge, we can simplify
these two functions:

* only the first pfn could be IS_MAX_ORDER_ALIGNED();

* the range passed to deferred_[init|free]_pages always comes from
  memblock.memory, for which we have already allocated a covering
  memmap, so pfn_valid() always returns true and the related checks
  can be removed.

[richard.weiyang@gmail.com: adjust function declaration indentation per David]
Link: https://lkml.kernel.org/r/20240613114525.27528-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20240612020421.31975-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Mike Rapoport (IBM)
Cc: Kirill A. Shutemov
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
---
 mm/mm_init.c | 64 ++++++++----------------------------------------------------
 1 file changed, 8 insertions(+), 56 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/mm/mm_init.c b/mm/mm_init.c
index b882ae7977ae..d3a222b92029 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1916,8 +1916,8 @@ unsigned long __init node_map_pfn_alignment(void)
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
-				       unsigned long nr_pages)
+static void __init deferred_free_pages(unsigned long pfn,
+				       unsigned long nr_pages)
 {
 	struct page *page;
 	unsigned long i;
@@ -1955,69 +1955,21 @@ static inline void __init pgdat_init_report_one_done(void)
 	complete(&pgdat_init_all_done_comp);
 }
 
-/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
- *
- * We check if a current MAX_PAGE_ORDER block is valid by only checking the
- * validity of the head pfn.
- */
-static inline bool __init deferred_pfn_valid(unsigned long pfn)
-{
-	if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
-		return false;
-	return true;
-}
-
-/*
- * Free pages to buddy allocator. Try to free aligned pages in
- * MAX_ORDER_NR_PAGES sizes.
- */
-static void __init deferred_free_pages(unsigned long pfn,
-				       unsigned long end_pfn)
-{
-	unsigned long nr_free = 0;
-
-	for (; pfn < end_pfn; pfn++) {
-		if (!deferred_pfn_valid(pfn)) {
-			deferred_free_range(pfn - nr_free, nr_free);
-			nr_free = 0;
-		} else if (IS_MAX_ORDER_ALIGNED(pfn)) {
-			deferred_free_range(pfn - nr_free, nr_free);
-			nr_free = 1;
-		} else {
-			nr_free++;
-		}
-	}
-	/* Free the last block of pages to allocator */
-	deferred_free_range(pfn - nr_free, nr_free);
-}
-
 /*
  * Initialize struct pages. We minimize pfn page lookups and scheduler checks
  * by performing it only once every MAX_ORDER_NR_PAGES.
  * Return number of pages initialized.
  */
-static unsigned long __init deferred_init_pages(struct zone *zone,
-						unsigned long pfn,
-						unsigned long end_pfn)
+static unsigned long __init deferred_init_pages(struct zone *zone,
+		unsigned long pfn, unsigned long end_pfn)
 {
 	int nid = zone_to_nid(zone);
-	unsigned long nr_pages = 0;
+	unsigned long nr_pages = end_pfn - pfn;
 	int zid = zone_idx(zone);
-	struct page *page = NULL;
+	struct page *page = pfn_to_page(pfn);
 
-	for (; pfn < end_pfn; pfn++) {
-		if (!deferred_pfn_valid(pfn)) {
-			page = NULL;
-			continue;
-		} else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
-			page = pfn_to_page(pfn);
-		} else {
-			page++;
-		}
+	for (; pfn < end_pfn; pfn++, page++)
 		__init_single_page(page, pfn, zid, nid);
-		nr_pages++;
-	}
 	return nr_pages;
 }
 
@@ -2096,7 +2048,7 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
 			break;
 
 		t = min(mo_pfn, epfn);
-		deferred_free_pages(spfn, t);
+		deferred_free_pages(spfn, t - spfn);
 
 		if (mo_pfn <= epfn)
 			break;
-- cgit v1.2.3
From 11d5401b011e3557894d824dac210f0b18cc3911 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Wed, 29 May 2024 13:19:04 +0200
Subject: mm/mm_init: initialize page->_mapcount directly in
 __init_single_page()

Let's simply initialize page->_mapcount directly. We can now get rid of
page_mapcount_reset().

Link: https://lkml.kernel.org/r/20240529111904.2069608-7-david@redhat.com
Signed-off-by: David Hildenbrand
Tested-by: Sergey Senozhatsky [zram/zsmalloc workloads]
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Matthew Wilcox (Oracle)
Cc: Mike Rapoport (IBM)
Cc: Minchan Kim
Signed-off-by: Andrew Morton
---
 include/linux/mm.h | 5 -----
 mm/mm_init.c       | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61a7f680f0ba..4f75fee497f1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1210,11 +1210,6 @@ static inline int folio_entire_mapcount(const struct folio *folio)
 	return atomic_read(&folio->_entire_mapcount) + 1;
 }
 
-static inline void page_mapcount_reset(struct page *page)
-{
-	atomic_set(&(page)->_mapcount, -1);
-}
-
 static inline int folio_large_mapcount(const struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);

diff --git a/mm/mm_init.c b/mm/mm_init.c
index d3a222b92029..5d150d412920 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -569,7 +569,7 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
 	mm_zero_struct_page(page);
 	set_page_links(page, zone, nid, pfn);
 	init_page_count(page);
-	page_mapcount_reset(page);
+	atomic_set(&page->_mapcount, -1);
 	page_cpupid_reset_last(page);
 	page_kasan_tag_reset(page);
-- cgit v1.2.3
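
For context on the magic -1: _mapcount is stored with a bias of -1, so
setting it to -1 marks the page as having no mappings, and readers add the
bias back. A simplified sketch of the read side under that convention (a
hypothetical helper for illustration, not the current upstream API):

/* _mapcount encoding: -1 means "no mappings"; a stored value of N - 1
 * means N mappings. Sketch of how a reader undoes the bias: */
static inline int sketch_page_mapcount(struct page *page)
{
	return atomic_read(&page->_mapcount) + 1;	/* remove the -1 bias */
}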
From 13c526540b316937a16946e75d459e011be0ce2e Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 7 Jun 2024 11:09:36 +0200
Subject: mm: pass meminit_context to __free_pages_core()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm/memory_hotplug: use PageOffline() instead of
PageReserved() for !ZONE_DEVICE".

This can be considered a long-overdue follow-up to some parts of [1]. The
patches are based on [2], but they are not strictly required -- [2] just
makes it clearer why we can use adjust_managed_page_count() for memory
hotplug without going into details about highmem.

We stop initializing pages with PageReserved() in memory hotplug code --
except when dealing with ZONE_DEVICE for now. Instead, we use
PageOffline(): all pages are initialized to PageOffline() when onlining a
memory section, and only the ones actually getting exposed to the
system/page allocator will get PageOffline cleared.

This way, we enlighten memory hotplug more about PageOffline() pages and
can clean up some hacks we have in virtio-mem code.

What about ZONE_DEVICE? PageOffline() is wrong, but we might just stop
using PageReserved() for them later by simply checking for
is_zone_device_page() at suitable places. That will be a separate patch
set / proposal.

This primarily affects virtio-mem, HV-balloon and XEN balloon. I only
briefly tested with virtio-mem, which benefits most from these cleanups.

[1] https://lore.kernel.org/all/20191024120938.11237-1-david@redhat.com/
[2] https://lkml.kernel.org/r/20240607083711.62833-1-david@redhat.com

This patch (of 3):

In preparation for further changes, let's teach __free_pages_core() about
the differences in memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory hotplug
path, and spell out why memory freed via memblock cannot currently use
adjust_managed_page_count().
[david@redhat.com: add missed CONFIG_DEFERRED_STRUCT_PAGE_INIT]
  Link: https://lkml.kernel.org/r/b72e6efd-fb0a-459c-b1a0-88a98e5b19e2@redhat.com
[david@redhat.com: fix up the memblock comment, per Oscar]
  Link: https://lkml.kernel.org/r/2ed64218-7f3b-4302-a5dc-27f060654fe2@redhat.com
[david@redhat.com: add the parameter name also in the declaration]
  Link: https://lkml.kernel.org/r/ca575956-f0dd-4fb9-a307-6b7621681ed9@redhat.com
Link: https://lkml.kernel.org/r/20240607090939.89524-1-david@redhat.com
Link: https://lkml.kernel.org/r/20240607090939.89524-2-david@redhat.com
Signed-off-by: David Hildenbrand
Cc: Alexander Potapenko
Cc: Dexuan Cui
Cc: Dmitry Vyukov
Cc: Eugenio Pérez
Cc: Haiyang Zhang
Cc: Jason Wang
Cc: Juergen Gross
Cc: "K. Y. Srinivasan"
Cc: Marco Elver
Cc: Michael S. Tsirkin
Cc: Mike Rapoport (IBM)
Cc: Oleksandr Tyshchenko
Cc: Oscar Salvador
Cc: Stefano Stabellini
Cc: Wei Liu
Cc: Xuan Zhuo
Signed-off-by: Andrew Morton
---
 mm/internal.h       |  3 ++-
 mm/kmsan/init.c     |  2 +-
 mm/memory_hotplug.c |  9 +--------
 mm/mm_init.c        |  6 +++---
 mm/page_alloc.c     | 17 +++++++++++++++--
 5 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/mm/internal.h b/mm/internal.h
index 34217cf4514d..0af4c9885424 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -604,7 +604,8 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
 				    int mt);
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
 				unsigned int order);
-extern void __free_pages_core(struct page *page, unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order,
+			      enum meminit_context context);
 
 /*
  * This will have no effect, other than possibly generating a warning, if the

diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 3ac3b8921d36..ca79636f858e 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -172,7 +172,7 @@ static void do_collection(void)
 		shadow = smallstack_pop(&collect);
 		origin = smallstack_pop(&collect);
 		kmsan_setup_meta(page, shadow, origin, collect.order);
-		__free_pages_core(page, collect.order);
+		__free_pages_core(page, collect.order, MEMINIT_EARLY);
 	}
 }

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f7580ec80538..e5aafed2b9ce 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -630,14 +630,7 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void generic_online_page(struct page *page, unsigned int order)
 {
-	/*
-	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
-	 * so we should map it first. This is better than introducing a special
-	 * case in page freeing fast path.
-	 */
-	debug_pagealloc_map_pages(page, 1 << order);
-	__free_pages_core(page, order);
-	totalram_pages_add(1UL << order);
+	__free_pages_core(page, order, MEMINIT_HOTPLUG);
 }
 EXPORT_SYMBOL_GPL(generic_online_page);

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5d150d412920..03874f624b32 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1931,7 +1931,7 @@ static void __init deferred_free_pages(unsigned long pfn,
 	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
 		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
 			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
-		__free_pages_core(page, MAX_PAGE_ORDER);
+		__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
 		return;
 	}
 
@@ -1941,7 +1941,7 @@ static void __init deferred_free_pages(unsigned long pfn,
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
 		if (pageblock_aligned(pfn))
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_core(page, 0);
+		__free_pages_core(page, 0, MEMINIT_EARLY);
 	}
 }
 
@@ -2471,7 +2471,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
 		}
 	}
 
-	__free_pages_core(page, order);
+	__free_pages_core(page, order, MEMINIT_EARLY);
 }
 
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d7b4424b645..8c6b4ba62ac1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1219,7 +1219,8 @@ static void __free_pages_ok(struct page *page, unsigned int order,
 	__count_vm_events(PGFREE, 1 << order);
 }
 
-void __free_pages_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order,
+		       enum meminit_context context)
 {
 	unsigned int nr_pages = 1 << order;
 	struct page *p = page;
@@ -1239,7 +1240,19 @@ void __free_pages_core(struct page *page, unsigned int order)
 	__ClearPageReserved(p);
 	set_page_count(p, 0);
 
-	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
+	    unlikely(context == MEMINIT_HOTPLUG)) {
+		/*
+		 * Freeing the page with debug_pagealloc enabled will try to
+		 * unmap it; some archs don't like double-unmappings, so
+		 * map it first.
+		 */
+		debug_pagealloc_map_pages(page, nr_pages);
+		adjust_managed_page_count(page, nr_pages);
+	} else {
+		/* memblock adjusts totalram_pages() manually. */
+		atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+	}
 
 	if (page_contains_unaccepted(page, order)) {
 		if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
-- cgit v1.2.3
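
The difference between the two branches is what gets updated beyond the
zone's managed_pages. For reference, the core of adjust_managed_page_count()
in kernels of this era looks roughly like the sketch below (the
CONFIG_HIGHMEM handling is elided), which is why the hotplug path can use it
directly while the memblock boot path must not touch totalram_pages here:

/* Rough sketch of adjust_managed_page_count() (highmem handling elided):
 * updates both the zone's managed page count and the global total, which
 * the memblock boot path accounts for separately. */
void adjust_managed_page_count(struct page *page, long count)
{
	atomic_long_add(count, &page_zone(page)->managed_pages);
	totalram_pages_add(count);
}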
From 503b158fc30f203a1854c87183ca3467c6466001 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 7 Jun 2024 11:09:37 +0200
Subject: mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with
 PageOffline() instead of PageReserved()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently initialize the memmap such that PG_reserved is set and the
refcount of the page is 1. In virtio-mem code, we have to manually clear
that PG_reserved flag to make memory offlining with partially hotplugged
memory blocks possible: has_unmovable_pages() would otherwise bail out on
such pages.

We want to avoid PG_reserved where possible and move to typed pages
instead. Further, we want to enlighten memory offlining code about
PG_offline: offline pages in an online memory section. One example is
handling managed page count adjustments in a cleaner way during memory
offlining.

So let's initialize the pages with PG_offline instead of PG_reserved.
generic_online_page()->__free_pages_core() will now clear that flag before
handing that memory to the buddy.

Note that the page refcount is still 1 and would forbid offlining of such
memory except when special care is taken during GOING_OFFLINE, as
currently only implemented by virtio-mem.

With this change, we can now get non-PageReserved() pages in the XEN
balloon list. From what I can tell, that can already happen via
decrease_reservation(), so that should be fine.

HV-balloon should not really observe a change: partially onlined memory
blocks still cannot get surprise-offlined, because the refcount of these
PageOffline() pages is 1.

Update virtio-mem, HV-balloon and XEN-balloon code to be aware that
hotplugged pages are now PageOffline() instead of PageReserved() before
they are handed over to the buddy.

We'll leave the ZONE_DEVICE case alone for now.

Note that self-hosted vmemmap pages will no longer be marked as reserved.
This matches ordinary vmemmap pages allocated from the buddy during memory
hotplug. Now, really only vmemmap pages allocated from memblock during
early boot will be marked reserved. Existing PageReserved() checks seem
to be handling all relevant cases correctly even after this change.

Link: https://lkml.kernel.org/r/20240607090939.89524-3-david@redhat.com
Signed-off-by: David Hildenbrand
Acked-by: Oscar Salvador [generic memory-hotplug bits]
Cc: Alexander Potapenko
Cc: Dexuan Cui
Cc: Dmitry Vyukov
Cc: Eugenio Pérez
Cc: Haiyang Zhang
Cc: Jason Wang
Cc: Juergen Gross
Cc: "K. Y. Srinivasan"
Cc: Marco Elver
Cc: Michael S. Tsirkin
Cc: Mike Rapoport (IBM)
Cc: Oleksandr Tyshchenko
Cc: Stefano Stabellini
Cc: Wei Liu
Cc: Xuan Zhuo
Signed-off-by: Andrew Morton
---
 drivers/hv/hv_balloon.c     |  5 ++---
 drivers/virtio/virtio_mem.c | 18 ++++++++++++------
 drivers/xen/balloon.c       |  9 +++++++--
 include/linux/page-flags.h  | 12 +++++-------
 mm/memory_hotplug.c         | 16 ++++++++++------
 mm/mm_init.c                | 10 ++++++++--
 mm/page_alloc.c             | 32 +++++++++++++++++++++++---------
 7 files changed, 67 insertions(+), 35 deletions(-)

(limited to 'mm/mm_init.c')

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 0e7427c2baf5..c38dcdfcb914 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -683,9 +683,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
 		if (!PageOffline(pg))
 			__SetPageOffline(pg);
 		return;
-	}
-	if (PageOffline(pg))
-		__ClearPageOffline(pg);
+	} else if (!PageOffline(pg))
+		return;
 
 	/* This frame is currently backed; online the page. */
 	generic_online_page(pg, 0);

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index a3857bacc844..b90df29621c8 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1146,12 +1146,16 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
 	for (; nr_pages--; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
-		__SetPageOffline(page);
-		if (!onlined) {
+		if (!onlined)
+			/*
+			 * Pages that have not been onlined yet were initialized
+			 * to PageOffline(). Remember that we have to route them
+			 * through generic_online_page().
+			 */
 			SetPageDirty(page);
-			/* FIXME: remove after cleanups */
-			ClearPageReserved(page);
-		}
+		else
+			__SetPageOffline(page);
+		VM_WARN_ON_ONCE(!PageOffline(page));
 	}
 	page_offline_end();
 }
@@ -1166,9 +1170,11 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
 	for (; nr_pages--; pfn++) {
 		struct page *page = pfn_to_page(pfn);
 
-		__ClearPageOffline(page);
 		if (!onlined)
+			/* generic_online_page() will clear PageOffline(). */
 			ClearPageDirty(page);
+		else
+			__ClearPageOffline(page);
 	}
 }

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index aaf2514fcfa4..528395133b4f 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -146,7 +146,8 @@ static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
 /* balloon_append: add the given page to the balloon. */
 static void balloon_append(struct page *page)
 {
-	__SetPageOffline(page);
+	if (!PageOffline(page))
+		__SetPageOffline(page);
 
 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
 	if (PageHighMem(page)) {
@@ -412,7 +413,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 
 		xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
 
-		/* Relinquish the page back to the allocator. */
+		/*
+		 * Relinquish the page back to the allocator. Note that
+		 * some pages, including ones added via xen_online_page(), might
+		 * not be marked reserved; free_reserved_page() will handle that.
+		 */
 		free_reserved_page(page);
 	}

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0f7c7320391e..b23772b08cc1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -30,16 +30,11 @@
  * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
  *   to read/write these pages might end badly. Don't touch!
  * - The zero page(s)
- * - Pages not added to the page allocator when onlining a section because
- *   they were excluded via the online_page_callback() or because they are
- *   PG_hwpoison.
  * - Pages allocated in the context of kexec/kdump (loaded kernel image,
  *   control pages, vmcoreinfo)
  * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
  *   not marked PG_reserved (as they might be in use by somebody else who does
  *   not respect the caching strategy).
- * - Pages part of an offline section (struct pages of offline sections should
- *   not be trusted as they will be initialized when first onlined).
  * - MCA pages on ia64
  * - Pages holding CPU notes for POWER Firmware Assisted Dump
  * - Device memory (e.g. PMEM, DAX, HMM)
@@ -1020,6 +1015,10 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy)
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
+ * When a memory block gets onlined, all pages are initialized with a
+ * refcount of 1 and PageOffline(). generic_online_page() will
+ * take care of clearing PageOffline().
+ *
 * If a driver wants to allow to offline unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
@@ -1027,8 +1026,7 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy)
 * pages (now with a reference count of zero) are treated like free pages,
 * allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
- * require to re-set the pages PageOffline() and not giving them to the
- * buddy via online_page_callback_t.
+ * require not giving them to the buddy via generic_online_page().
 *
 * There are drivers that mark a page PageOffline() and expect there won't be
 * any further access to page content.
 PFN walkers that read content of random

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e5aafed2b9ce..f9dc9c2c9469 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -734,7 +734,7 @@ static inline void section_taint_zone_device(unsigned long pfn)
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
  * and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
+ * call, all affected pages are PageOffline().
  *
  * All aligned pageblocks are initialized to the specified migratetype
  * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
@@ -1101,8 +1101,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
 
-	for (i = 0; i < nr_pages; i++)
-		SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		__ClearPageOffline(page);
+		SetPageVmemmapSelfHosted(page);
+	}
 
 	/*
 	 * It might be that the vmemmap_pages fully span sections. If that is
@@ -1960,9 +1964,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	 * Don't allow to offline memory blocks that contain holes.
 	 * Consequently, memory blocks with holes can never get onlined
 	 * via the hotplug path - online_pages() - as hotplugged memory has
-	 * no holes. This way, we e.g., don't have to worry about marking
-	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
-	 * avoid using walk_system_ram_range() later.
+	 * no holes. This way, we don't have to worry about memory holes,
+	 * don't need pfn_valid() checks, and can avoid using
+	 * walk_system_ram_range() later.
 	 */
 	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
 			      count_system_ram_pages_cb);

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 03874f624b32..c4bd97d3697f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -893,8 +893,14 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
 		page = pfn_to_page(pfn);
 		__init_single_page(page, pfn, zone, nid);
 
-		if (context == MEMINIT_HOTPLUG)
-			__SetPageReserved(page);
+		if (context == MEMINIT_HOTPLUG) {
+#ifdef CONFIG_ZONE_DEVICE
+			if (zone == ZONE_DEVICE)
+				__SetPageReserved(page);
+			else
+#endif
+				__SetPageOffline(page);
+		}
 
 		/*
 		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8c6b4ba62ac1..7bc43cf24e3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1230,18 +1230,23 @@ void __free_pages_core(struct page *page, unsigned int order,
 	 * When initializing the memmap, __init_single_page() sets the refcount
 	 * of all pages to 1 ("allocated"/"not free"). We have to set the
 	 * refcount of all involved pages to 0.
+	 *
+	 * Note that hotplugged memory pages are initialized to PageOffline().
+	 * Pages freed from memblock might be marked as reserved.
 	 */
-	prefetchw(p);
-	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
-		prefetchw(p + 1);
-		__ClearPageReserved(p);
-		set_page_count(p, 0);
-	}
-	__ClearPageReserved(p);
-	set_page_count(p, 0);
-
 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
 	    unlikely(context == MEMINIT_HOTPLUG)) {
+		prefetchw(p);
+		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+			prefetchw(p + 1);
+			VM_WARN_ON_ONCE(PageReserved(p));
+			__ClearPageOffline(p);
+			set_page_count(p, 0);
+		}
+		VM_WARN_ON_ONCE(PageReserved(p));
+		__ClearPageOffline(p);
+		set_page_count(p, 0);
+
 		/*
 		 * Freeing the page with debug_pagealloc enabled will try to
 		 * unmap it; some archs don't like double-unmappings, so
@@ -1250,6 +1255,15 @@ void __free_pages_core(struct page *page, unsigned int order,
 		debug_pagealloc_map_pages(page, nr_pages);
 		adjust_managed_page_count(page, nr_pages);
 	} else {
+		prefetchw(p);
+		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+			prefetchw(p + 1);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
+
 		/* memblock adjusts totalram_pages() manually. */
 		atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 	}
-- cgit v1.2.3
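
Put together, the PageOffline() lifecycle for !ZONE_DEVICE hotplugged memory
is: memmap initialization sets the flag with a refcount of 1, and
generic_online_page() clears it when handing pages to the buddy. A hedged
sketch of what a driver-side online callback amounts to after this series
(example_online_page() is hypothetical; the helpers it calls are from the
patches above):

/* Hypothetical online callback mirroring the new contract: pages arrive
 * PageOffline() with refcount 1; routing them through
 * generic_online_page() clears the flag and frees them to the buddy.
 * Pages a driver keeps back stay PageOffline() with refcount 1, which
 * blocks surprise offlining. */
static void example_online_page(struct page *page, unsigned int order)
{
	VM_WARN_ON_ONCE(!PageOffline(page));	/* set during memmap init */
	generic_online_page(page, order);	/* clears PageOffline() */
}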
From 64e0ba3948ddb99246e8f3c2c34a67be324220db Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Wed, 19 Jun 2024 01:06:10 +0000
Subject: mm/mm_init.c: move build check on MAX_ZONELISTS out of ifdef

The current build check on MAX_ZONELISTS is wrapped in
CONFIG_DEBUG_MEMORY_INIT, so it may not be exercised in every build.
Let's move it to a more general place.

Link: https://lkml.kernel.org/r/20240619010612.20740-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Mike Rapoport (IBM)
Signed-off-by: Andrew Morton
---
 mm/mm_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/mm_init.c')

diff --git a/mm/mm_init.c b/mm/mm_init.c
index c4bd97d3697f..1095dcf25ced 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -54,7 +54,6 @@ void __init mminit_verify_zonelist(void)
 	struct zonelist *zonelist;
 	int i, listid, zoneid;
 
-	BUILD_BUG_ON(MAX_ZONELISTS > 2);
 	for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
 
 		/* Identify the zone and nodelist */
@@ -2657,6 +2656,7 @@ static void __init mem_init_print_info(void)
 void __init mm_core_init(void)
 {
 	/* Initializations relying on SMP setup */
+	BUILD_BUG_ON(MAX_ZONELISTS > 2);
 	build_all_zonelists(NULL);
 	page_alloc_init_cpuhp();
-- cgit v1.2.3
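
The reason the move matters: BUILD_BUG_ON() only fires in translation units
where the containing function is actually compiled, so hiding it inside a
function built only with CONFIG_DEBUG_MEMORY_INIT weakened the check. A
minimal stand-alone illustration of the compile-time mechanism (a simplified
macro; the kernel's real BUILD_BUG_ON is more elaborate):

/* Simplified compile-time assertion: a negative array size is a compile
 * error, so the build fails exactly when cond is true -- but only in
 * files where this line is actually compiled. */
#define MY_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

void sanity_checks(void)
{
	MY_BUILD_BUG_ON(sizeof(int) < 2);	/* compiles: condition false */
	/* MY_BUILD_BUG_ON(sizeof(int) < 8);	   would break the build */
}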