From 122e093c1734361dedb64f65c99b93e28e4624f4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:33:26 -0700 Subject: mm/page_alloc: fix memory map initialization for descending nodes On systems with memory nodes sorted in descending order, for instance Dell Precision WorkStation T5500, the struct pages for higher PFNs and respectively lower nodes, could be overwritten by the initialization of struct pages corresponding to the holes in the memory sections. For example for the below memory layout [ 0.245624] Early memory node ranges [ 0.248496] node 1: [mem 0x0000000000001000-0x0000000000090fff] [ 0.251376] node 1: [mem 0x0000000000100000-0x00000000dbdf8fff] [ 0.254256] node 1: [mem 0x0000000100000000-0x0000001423ffffff] [ 0.257144] node 0: [mem 0x0000001424000000-0x0000002023ffffff] the range 0x1424000000 - 0x1428000000 in the beginning of node 0 starts in the middle of a section and will be considered as a hole during the initialization of the last section in node 1. The wrong initialization of the memory map causes panic on boot when CONFIG_DEBUG_VM is enabled. Reorder loop order of the memory map initialization so that the outer loop will always iterate over populated memory regions in the ascending order and the inner loop will select the zone corresponding to the PFN range. This way initialization of the struct pages for the memory holes will be always done for the ranges that are actually not populated. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/YNXlMqBbL+tBG7yq@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=213073 Link: https://lkml.kernel.org/r/20210624062305.10940-1-rppt@kernel.org Fixes: 0740a50b9baa ("mm/page_alloc.c: refactor initialization of struct page for holes in memory layout") Signed-off-by: Mike Rapoport Cc: Boris Petkov Cc: Robert Shteynfeld Cc: Baoquan He Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 94 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 36 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef2265f86b91..5b5c9f5813b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6400,7 +6400,7 @@ void __ref memmap_init_zone_device(struct zone *zone, return; /* - * The call to memmap_init_zone should have already taken care + * The call to memmap_init should have already taken care * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ @@ -6465,7 +6465,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during - * memmap_init_zone(). + * memmap_init_zone_range(). * * But, there could be struct pages that correspond to holes in * memblock.memory. This can happen because of the following reasons: @@ -6484,9 +6484,9 @@ static void __meminit zone_init_free_lists(struct zone *zone) * zone/node above the hole except for the trailing pages in the last * section that will be appended to the zone/node below. */ -static u64 __meminit init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) +static void __init init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { unsigned long pfn; u64 pgcnt = 0; @@ -6502,56 +6502,77 @@ static u64 __meminit init_unavailable_range(unsigned long spfn, pgcnt++; } - return pgcnt; + if (pgcnt) + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + node, zone_names[zone], pgcnt); } #else -static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, - int zone, int node) +static inline void init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { - return 0; } #endif -void __meminit __weak memmap_init_zone(struct zone *zone) +static void __init memmap_init_zone_range(struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *hole_pfn) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; - int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); - static unsigned long hole_pfn; + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + + if (start_pfn >= end_pfn) + return; + + memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (*hole_pfn < start_pfn) + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + + *hole_pfn = end_pfn; +} + +static void __init memmap_init(void) +{ unsigned long start_pfn, end_pfn; - u64 pgcnt = 0; + unsigned long hole_pfn = 0; + int i, j, zone_id, nid; - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); - end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + struct pglist_data *node = NODE_DATA(nid); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = node->node_zones + j; - if (end_pfn > start_pfn) - memmap_init_range(end_pfn - start_pfn, nid, - zone_id, start_pfn, zone_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + if (!populated_zone(zone)) + continue; - if (hole_pfn < start_pfn) - pgcnt += init_unavailable_range(hole_pfn, start_pfn, - zone_id, nid); - hole_pfn = end_pfn; + memmap_init_zone_range(zone, start_pfn, end_pfn, + &hole_pfn); + zone_id = j; + } } #ifdef CONFIG_SPARSEMEM /* - * Initialize the hole in the range [zone_end_pfn, section_end]. - * If zone boundary falls in the middle of a section, this hole - * will be re-initialized during the call to this function for the - * higher zone. + * Initialize the memory map for hole in the range [memory_end, + * section_end]. + * Append the pages in this hole to the highest zone in the last + * node. + * The call to init_unavailable_range() is outside the ifdef to + * silence the compiler warining about zone_id set but not used; + * for FLATMEM it is a nop anyway */ - end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); if (hole_pfn < end_pfn) - pgcnt += init_unavailable_range(hole_pfn, end_pfn, - zone_id, nid); #endif - - if (pgcnt) - pr_info(" %s zone: %llu pages in unavailable ranges\n", - zone->name, pgcnt); + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } static int zone_batchsize(struct zone *zone) @@ -7254,7 +7275,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(zone); init_currently_empty_zone(zone, zone->zone_start_pfn, size); - memmap_init_zone(zone); } } @@ -7780,6 +7800,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + + memmap_init(); } static int __init cmdline_parse_core(char *p, unsigned long *core, -- cgit v1.2.3 From ff4b2b4014cbffb3d32b22629252f4dc8616b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:33:29 -0700 Subject: mm/page_alloc: correct return value of populated elements if bulk array is populated Dave Jones reported the following This made it into 5.13 final, and completely breaks NFSD for me (Serving tcp v3 mounts). Existing mounts on clients hang, as do new mounts from new clients. Rebooting the server back to rc7 everything recovers. The commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") returns the wrong value if the array is already populated which is interpreted as an allocation failure. Dave reported this fixes his problem and it also passed a test running dbench over NFS. Link: https://lkml.kernel.org/r/20210628150219.GC3840@techsingularity.net Fixes: b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") Signed-off-by: Mel Gorman Reported-by: Dave Jones Tested-by: Dave Jones Cc: Dan Carpenter Cc: Jesper Dangaard Brouer Cc: Vlastimil Babka Cc: [5.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b5c9f5813b9..2bf03c76504b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5058,7 +5058,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Already populated array? */ if (unlikely(page_array && nr_pages - nr_populated == 0)) - return 0; + return nr_populated; /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) -- cgit v1.2.3 From d2f07ec052ac1a720d6f1919e3dee7d73f04d495 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:07 -0700 Subject: mm: make __dump_page static Patch series "Constify struct page arguments". While working on various solutions to the 32-bit struct page size regression, one of the problems I found was the networking stack expects to be able to pass const struct page pointers around, and the mm doesn't provide a lot of const-friendly functions to call. The root tangle of problems is that a lot of functions call VM_BUG_ON_PAGE(), which calls dump_page(), which calls a lot of functions which don't take a const struct page (but could be const). This patch (of 6): The only caller of __dump_page() now opencodes dump_page(), so remove it as an externally visible symbol. Link: https://lkml.kernel.org/r/20210416231531.2521383-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210416231531.2521383-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 3 +-- mm/debug.c | 2 +- mm/page_alloc.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 5d0767cb424a..1935d4c72d10 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,8 +9,7 @@ struct page; struct vm_area_struct; struct mm_struct; -extern void dump_page(struct page *page, const char *reason); -extern void __dump_page(struct page *page, const char *reason); +void dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); diff --git a/mm/debug.c b/mm/debug.c index 0bdda8407f71..84cdcd0f7bd3 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,7 +42,7 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -void __dump_page(struct page *page, const char *reason) +static void __dump_page(struct page *page, const char *reason) { struct page *head = compound_head(page); struct address_space *mapping; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bf03c76504b..4087340fca32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -658,8 +658,7 @@ static void bad_page(struct page *page, const char *reason) pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - __dump_page(page, reason); - dump_page_owner(page); + dump_page(page, reason); print_modules(); dump_stack(); -- cgit v1.2.3 From 691d9497285a90346a67bfee5cac2007e5e18405 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 28 Jun 2021 19:41:10 -0700 Subject: mm/page_alloc: bail out on fatal signal during reclaim/compaction retry attempt A customer experienced a low-memory situation and decided to issue a SIGKILL (i.e. a fatal signal). Instead of promptly terminating as one would expect, the aforementioned task remained unresponsive. Further investigation indicated that the task was "stuck" in the reclaim/compaction retry loop. Now, it does not make sense to retry compaction when a fatal signal is pending. In the context of try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned; albeit, not every zone, on the zone list, would be considered in the case a fatal signal is found to be pending. Yet, in should_compact_retry(), given the last known compaction result, each zone, on the zone list, can be considered/or checked (see compaction_zonelist_suitable()). For example, if a zone was found to succeed, then reclaim/compaction would be tried again (notwithstanding the above). This patch ensures that compaction is not needlessly retried irrespective of the last known compaction result e.g. if it was skipped, in the unlikely case a fatal signal is found pending. So, OOM is at least attempted. Link: https://lkml.kernel.org/r/20210520142901.3371299-1-atomlin@redhat.com Signed-off-by: Aaron Tomlin Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4087340fca32..ea1efbb06e40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4251,6 +4251,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (!order) return false; + if (fatal_signal_pending(current)) + return false; + if (compaction_made_progress(compact_result)) (*compaction_retries)++; -- cgit v1.2.3 From ca891f41c4c7921a03dfd0fa1faf324393724480 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:22 -0700 Subject: mm: constify get_pfnblock_flags_mask and get_pfnblock_migratetype The struct page is not modified by these routines, so it can be marked const. Link: https://lkml.kernel.org/r/20210416231531.2521383-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 2 +- mm/page_alloc.c | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index fff52ad370c1..973fd731a520 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,7 +54,7 @@ extern unsigned int pageblock_order; /* Forward declaration */ struct page; -unsigned long get_pfnblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea1efbb06e40..4f5eedb6593a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -474,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) #endif /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, +static inline unsigned long *get_pageblock_bitmap(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM @@ -484,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); @@ -495,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) } static __always_inline -unsigned long __get_pfnblock_flags_mask(struct page *page, +unsigned long __get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask) { @@ -520,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long mask) +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { return __get_pfnblock_flags_mask(page, pfn, mask); } -static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +static __always_inline int get_pfnblock_migratetype(const struct page *page, + unsigned long pfn) { return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } -- cgit v1.2.3 From 9660ecaa79ce5c068aa3138ca7e29a9402f284ed Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 28 Jun 2021 19:41:31 -0700 Subject: mm/page_alloc: switch to pr_debug Having such debug messages in the dmesg log may confuse users. Therefore restrict debug output to cases where DEBUG is defined or dynamic debugging is enabled for the respective code piece. Link: https://lkml.kernel.org/r/976adb93-3041-ce63-48fc-55a6096a51c1@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5eedb6593a..902f889a324d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6770,9 +6770,8 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", - zone->name, zone->present_pages, - zone_batchsize(zone)); + pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, + zone->present_pages, zone_batchsize(zone)); } void __meminit init_currently_empty_zone(struct zone *zone, @@ -7042,8 +7041,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, - realtotalpages); + pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } #ifndef CONFIG_SPARSEMEM @@ -7243,9 +7241,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat) if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + pr_debug(" %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); @@ -7254,8 +7251,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { freesize -= dma_reserve; - printk(KERN_DEBUG " %s zone: %lu pages reserved\n", - zone_names[0], dma_reserve); + pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) -- cgit v1.2.3 From 28f836b6777b6f42dce068a40d83a891deaaca37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:38 -0700 Subject: mm/page_alloc: split per cpu page lists and zone stats The PCP (per-cpu page allocator in page_alloc.c) shares locking requirements with vmstat and the zone lock which is inconvenient and causes some issues. For example, the PCP list and vmstat share the same per-cpu space meaning that it's possible that vmstat updates dirty cache lines holding per-cpu lists across CPUs unless padding is used. Second, PREEMPT_RT does not want to disable IRQs for too long in the page allocator. This series splits the locking requirements and uses locks types more suitable for PREEMPT_RT, reduces the time when special locking is required for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT kernels. Why local_lock? PREEMPT_RT considers the following sequence to be unsafe as documented in Documentation/locking/locktypes.rst local_irq_disable(); spin_lock(&lock); The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save) -> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). While it's possible to separate this out, it generally means there are points where we enable IRQs and reenable them again immediately. To prevent a migration and the per-cpu pointer going stale, migrate_disable is also needed. That is a custom lock that is similar, but worse, than local_lock. Furthermore, on PREEMPT_RT, it's undesirable to leave IRQs disabled for too long. By converting to local_lock which disables migration on PREEMPT_RT, the locking requirements can be separated and start moving the protections for PCP, stats and the zone lock to PREEMPT_RT-safe equivalent locking. As a bonus, local_lock also means that PROVE_LOCKING does something useful. After that, it's obvious that zone_statistics incurs too much overhead and leaves IRQs disabled for longer than necessary on !PREEMPT_RT kernels. zone_statistics uses perfectly accurate counters requiring IRQs be disabled for parallel RMW sequences when inaccurate ones like vm_events would do. The series makes the NUMA statistics (NUMA_HIT and friends) inaccurate counters that then require no special protection on !PREEMPT_RT. The bulk page allocator can then do stat updates in bulk with IRQs enabled which should improve the efficiency. Technically, this could have been done without the local_lock and vmstat conversion work and the order simply reflects the timing of when different series were implemented. Finally, there are places where we conflate IRQs being disabled for the PCP with the IRQ-safe zone spinlock. The remainder of the series reduces the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels. By the end of the series, page_alloc.c does not call local_irq_save so the locking scope is a bit clearer. The one exception is that modifying NR_FREE_PAGES still happens in places where it's known the IRQs are disabled as it's harmless for PREEMPT_RT and would be expensive to split the locking there. No performance data is included because despite the overhead of the stats, it's within the noise for most workloads on !PREEMPT_RT. However, Jesper Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @ 3.60GHz CPU on the first version of this series. Focusing on the array variant of the bulk page allocator reveals the following. (CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz) ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size Baseline Patched 1 56.383 54.225 (+3.83%) 2 40.047 35.492 (+11.38%) 3 37.339 32.643 (+12.58%) 4 35.578 30.992 (+12.89%) 8 33.592 29.606 (+11.87%) 16 32.362 28.532 (+11.85%) 32 31.476 27.728 (+11.91%) 64 30.633 27.252 (+11.04%) 128 30.596 27.090 (+11.46%) While this is a positive outcome, the series is more likely to be interesting to the RT people in terms of getting parts of the PREEMPT_RT tree into mainline. This patch (of 9): The per-cpu page allocator lists and the per-cpu vmstat deltas are stored in the same struct per_cpu_pages even though vmstats have no direct impact on the per-cpu page lists. This is inconsistent because the vmstats for a node are stored on a dedicated structure. The bigger issue is that the per_cpu_pages structure is not cache-aligned and stat updates either cache conflict with adjacent per-cpu lists incurring a runtime cost or padding is required incurring a memory cost. This patch splits the per-cpu pagelists and the vmstat deltas into separate structures. It's mostly a mechanical conversion but some variable renaming is done to clearly distinguish the per-cpu pages structure (pcp) from the vmstats (pzstats). Superficially, this appears to increase the size of the per_cpu_pages structure but the movement of expire fills a structure hole so there is no impact overall. [mgorman@techsingularity.net: make it W=1 cleaner] Link: https://lkml.kernel.org/r/20210514144622.GA3735@techsingularity.net [mgorman@techsingularity.net: make it W=1 even cleaner] Link: https://lkml.kernel.org/r/20210516140705.GB3735@techsingularity.net [lkp@intel.com: check struct per_cpu_zonestat has a non-zero size] [vbabka@suse.cz: Init zone->per_cpu_zonestats properly] Link: https://lkml.kernel.org/r/20210512095458.30632-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210512095458.30632-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Jesper Dangaard Brouer Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 18 +++++----- include/linux/vmstat.h | 8 ++--- mm/page_alloc.c | 85 +++++++++++++++++++++++-------------------- mm/vmstat.c | 98 ++++++++++++++++++++++++++------------------------ 4 files changed, 113 insertions(+), 96 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c2bfefd34b59..a50b123ab7ae 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -341,20 +341,21 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ +#ifdef CONFIG_NUMA + int expire; /* When 0, remote pagesets are drained */ +#endif /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; -struct per_cpu_pageset { - struct per_cpu_pages pcp; -#ifdef CONFIG_NUMA - s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; -#endif +struct per_cpu_zonestat { #ifdef CONFIG_SMP - s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; + s8 stat_threshold; +#endif +#ifdef CONFIG_NUMA + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif }; @@ -484,7 +485,8 @@ struct zone { int node; #endif struct pglist_data *zone_pgdat; - struct per_cpu_pageset __percpu *pageset; + struct per_cpu_pages __percpu *per_cpu_pageset; + struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3299cd69e4ca..0c5f36504613 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone, int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; return x; } @@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; @@ -291,7 +291,7 @@ struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); @@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, - struct per_cpu_pageset *pset) { } + struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 902f889a324d..330c7307a92b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3026,15 +3026,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); + local_irq_restore(flags); } @@ -3133,7 +3132,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { - struct per_cpu_pageset *pcp; + struct per_cpu_pages *pcp; struct zone *z; bool has_pcps = false; @@ -3144,13 +3143,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) */ has_pcps = true; } else if (zone) { - pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + if (pcp->count) has_pcps = true; } else { for_each_populated_zone(z) { - pcp = per_cpu_ptr(z->pageset, cpu); - if (pcp->pcp.count) { + pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); + if (pcp->count) { has_pcps = true; break; } @@ -3280,7 +3279,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) migratetype = MIGRATE_MOVABLE; } - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= READ_ONCE(pcp->high)) @@ -3496,7 +3495,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { @@ -5105,7 +5104,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; while (nr_populated < nr_pages) { @@ -5720,7 +5719,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) continue; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" @@ -5812,7 +5811,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) free_pcp = 0; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; show_node(zone); printk(KERN_CONT @@ -5853,7 +5852,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), - K(this_cpu_read(zone->pageset->pcp.count)), + K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -6180,11 +6179,12 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void pageset_init(struct per_cpu_pageset *p); +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); /* These effectively disable the pcplists in the boot pageset completely */ #define BOOT_PAGESET_HIGH 0 #define BOOT_PAGESET_BATCH 1 -static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) @@ -6251,7 +6251,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) - pageset_init(&per_cpu(boot_pageset, cpu)); + per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -6650,14 +6650,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, WRITE_ONCE(pcp->high, high); } -static void pageset_init(struct per_cpu_pageset *p) +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - struct per_cpu_pages *pcp; int migratetype; - memset(p, 0, sizeof(*p)); + memset(pcp, 0, sizeof(*pcp)); + memset(pzstats, 0, sizeof(*pzstats)); - pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); @@ -6674,12 +6673,12 @@ static void pageset_init(struct per_cpu_pageset *p) static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, unsigned long batch) { - struct per_cpu_pageset *p; + struct per_cpu_pages *pcp; int cpu; for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_update(&p->pcp, high, batch); + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pageset_update(pcp, high, batch); } } @@ -6714,13 +6713,20 @@ static void zone_set_pageset_high_and_batch(struct zone *zone) void __meminit setup_zone_pageset(struct zone *zone) { - struct per_cpu_pageset *p; int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); + /* Size may be 0 on !SMP && !NUMA */ + if (sizeof(struct per_cpu_zonestat) > 0) + zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); + + zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_init(p); + struct per_cpu_pages *pcp; + struct per_cpu_zonestat *pzstats; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + per_cpu_pages_init(pcp, pzstats); } zone_set_pageset_high_and_batch(zone); @@ -6747,9 +6753,9 @@ void __init setup_per_cpu_pageset(void) * the nodes these zones are associated with. */ for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); - memset(pcp->vm_numa_stat_diff, 0, - sizeof(pcp->vm_numa_stat_diff)); + struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); + memset(pzstats->vm_numa_stat_diff, 0, + sizeof(pzstats->vm_numa_stat_diff)); } #endif @@ -6765,7 +6771,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * relies on the ability of the linker to provide the * offset of a (static) per cpu variable into the per cpu area. */ - zone->pageset = &boot_pageset; + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; zone->pageset_high = BOOT_PAGESET_HIGH; zone->pageset_batch = BOOT_PAGESET_BATCH; @@ -9046,15 +9053,17 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { int cpu; - struct per_cpu_pageset *pset; + struct per_cpu_zonestat *pzstats; - if (zone->pageset != &boot_pageset) { + if (zone->per_cpu_pageset != &boot_pageset) { for_each_online_cpu(cpu) { - pset = per_cpu_ptr(zone->pageset, cpu); - drain_zonestat(zone, pset); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + drain_zonestat(zone, pzstats); } - free_percpu(zone->pageset); - zone->pageset = &boot_pageset; + free_percpu(zone->per_cpu_pageset); + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; } } diff --git a/mm/vmstat.c b/mm/vmstat.c index cccee36b289c..f1400ba46beb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -44,7 +44,7 @@ static void zero_zone_numa_counters(struct zone *zone) for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { atomic_long_set(&zone->vm_numa_stat[item], 0); for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item] = 0; } } @@ -266,7 +266,7 @@ void refresh_zone_stat_thresholds(void) for_each_online_cpu(cpu) { int pgdat_threshold; - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; /* Base nodestat threshold on the largest populated zone. */ @@ -303,7 +303,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, threshold = (*calculate_pressure)(zone); for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; } } @@ -316,7 +316,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; long x; long t; @@ -389,7 +389,7 @@ EXPORT_SYMBOL(__mod_node_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; @@ -435,7 +435,7 @@ EXPORT_SYMBOL(__inc_node_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; @@ -495,7 +495,7 @@ EXPORT_SYMBOL(__dec_node_page_state); static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item, long delta, int overstep_mode) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; long o, n, t, z; @@ -781,19 +781,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets) int changes = 0; for_each_populated_zone(zone) { - struct per_cpu_pageset __percpu *p = zone->pageset; + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; +#ifdef CONFIG_NUMA + struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; +#endif for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { int v; - v = this_cpu_xchg(p->vm_stat_diff[i], 0); + v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0); if (v) { atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - __this_cpu_write(p->expire, 3); + __this_cpu_write(pcp->expire, 3); #endif } } @@ -801,12 +804,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { int v; - v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0); if (v) { atomic_long_add(v, &zone->vm_numa_stat[i]); global_numa_diff[i] += v; - __this_cpu_write(p->expire, 3); + __this_cpu_write(pcp->expire, 3); } } @@ -819,23 +822,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!__this_cpu_read(p->expire) || - !__this_cpu_read(p->pcp.count)) + if (!__this_cpu_read(pcp->expire) || + !__this_cpu_read(pcp->count)) continue; /* * We never drain zones local to this processor. */ if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); + __this_cpu_write(pcp->expire, 0); continue; } - if (__this_cpu_dec_return(p->expire)) + if (__this_cpu_dec_return(pcp->expire)) continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + if (__this_cpu_read(pcp->count)) { + drain_zone_pages(zone, this_cpu_ptr(pcp)); changes++; } } @@ -882,27 +885,27 @@ void cpu_vm_stats_fold(int cpu) int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_zonestat *pzstats; - p = per_cpu_ptr(zone->pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { + if (pzstats->vm_stat_diff[i]) { int v; - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } #ifdef CONFIG_NUMA for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (p->vm_numa_stat_diff[i]) { + if (pzstats->vm_numa_stat_diff[i]) { int v; - v = p->vm_numa_stat_diff[i]; - p->vm_numa_stat_diff[i] = 0; + v = pzstats->vm_numa_stat_diff[i]; + pzstats->vm_numa_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_numa_stat[i]); global_numa_diff[i] += v; } @@ -936,24 +939,24 @@ void cpu_vm_stats_fold(int cpu) * this is only called if !populated_zone(zone), which implies no other users of * pset->vm_stat_diff[] exist. */ -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { int i; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (pset->vm_stat_diff[i]) { - int v = pset->vm_stat_diff[i]; - pset->vm_stat_diff[i] = 0; + if (pzstats->vm_stat_diff[i]) { + int v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } #ifdef CONFIG_NUMA for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pset->vm_numa_stat_diff[i]) { - int v = pset->vm_numa_stat_diff[i]; + if (pzstats->vm_numa_stat_diff[i]) { + int v = pzstats->vm_numa_stat_diff[i]; - pset->vm_numa_stat_diff[i] = 0; + pzstats->vm_numa_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_numa_stat[i]); atomic_long_add(v, &vm_numa_stat[i]); } @@ -965,8 +968,8 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; - u16 __percpu *p = pcp->vm_numa_stat_diff + item; + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + u16 __percpu *p = pzstats->vm_numa_stat_diff + item; u16 v; v = __this_cpu_inc_return(*p); @@ -1693,21 +1696,23 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; + struct per_cpu_pages *pcp; + struct per_cpu_zonestat __maybe_unused *pzstats; - pageset = per_cpu_ptr(zone->pageset, i); + pcp = per_cpu_ptr(zone->per_cpu_pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" "\n high: %i" "\n batch: %i", i, - pageset->pcp.count, - pageset->pcp.high, - pageset->pcp.batch); + pcp->count, + pcp->high, + pcp->batch); #ifdef CONFIG_SMP + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); seq_printf(m, "\n vm stats threshold: %d", - pageset->stat_threshold); + pzstats->stat_threshold); #endif } seq_printf(m, @@ -1927,17 +1932,18 @@ static bool need_update(int cpu) struct zone *zone; for_each_populated_zone(zone) { - struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); struct per_cpu_nodestat *n; + /* * The fast way of checking if there are any vmstat diffs. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(p->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * + sizeof(pzstats->vm_stat_diff[0]))) return true; #ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * - sizeof(p->vm_numa_stat_diff[0]))) + if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * + sizeof(pzstats->vm_numa_stat_diff[0]))) return true; #endif if (last_pgdat == zone->zone_pgdat) -- cgit v1.2.3 From dbbee9d5cd83f9d0a29639e260516907ceb2ac3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:41 -0700 Subject: mm/page_alloc: convert per-cpu list protection to local_lock There is a lack of clarity of what exactly local_irq_save/local_irq_restore protects in page_alloc.c . It conflates the protection of per-cpu page allocation structures with per-cpu vmstat deltas. This patch protects the PCP structure using local_lock which for most configurations is identical to IRQ enabling/disabling. The scope of the lock is still wider than it should be but this is decreased later. It is possible for the local_lock to be embedded safely within struct per_cpu_pages but it adds complexity to free_unref_page_list. [akpm@linux-foundation.org: coding style fixes] [mgorman@techsingularity.net: work around a pahole limitation with zero-sized struct pagesets] Link: https://lkml.kernel.org/r/20210526080741.GW30378@techsingularity.net [lkp@intel.com: Make pagesets static] Link: https://lkml.kernel.org/r/20210512095458.30632-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ lib/Kconfig.debug | 3 +++ mm/page_alloc.c | 61 +++++++++++++++++++++++++++++++++++++------------- 3 files changed, 51 insertions(+), 15 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a50b123ab7ae..0d6bb737e5a2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -20,6 +20,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -337,6 +338,7 @@ enum zone_watermarks { #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) +/* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7723f58a9394..deca67d28abb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -313,6 +313,9 @@ config DEBUG_INFO_BTF config PAHOLE_HAS_SPLIT_BTF def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119") +config PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT + def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "122") + config DEBUG_INFO_BTF_MODULES def_bool y depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 330c7307a92b..89872ad5e872 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -122,6 +122,24 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) +struct pagesets { + local_lock_t lock; +#if defined(CONFIG_DEBUG_INFO_BTF) && \ + !defined(CONFIG_DEBUG_LOCK_ALLOC) && \ + !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT) + /* + * pahole 1.21 and earlier gets confused by zero-sized per-CPU + * variables and produces invalid BTF. Ensure that + * sizeof(struct pagesets) != 0 for older versions of pahole. + */ + char __pahole_hack; + #warning "pahole too old to support zero-sized struct pagesets" +#endif +}; +static DEFINE_PER_CPU(struct pagesets, pagesets) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1453,6 +1471,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (--count && --batch_free && !list_empty(list)); } + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1573,6 +1595,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, return; migratetype = get_pfnblock_migratetype(page, pfn); + + /* + * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock + * and protect vmstat updates. + */ local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype, @@ -2955,6 +2982,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3007,12 +3038,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -3028,13 +3059,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) unsigned long flags; struct per_cpu_pages *pcp; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3297,9 +3328,9 @@ void free_unref_page(struct page *page) if (!free_unref_page_prepare(page, pfn)) return; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); free_unref_page_commit(page, pfn); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3319,7 +3350,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, pfn); } - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_private(page); @@ -3332,12 +3363,12 @@ void free_unref_page_list(struct list_head *list) * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); } } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3494,7 +3525,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct page *page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); @@ -3502,7 +3533,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -5103,7 +5134,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed; /* Attempt the batch allocation */ - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; @@ -5141,12 +5172,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; failed_irq: - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); -- cgit v1.2.3 From f19298b9516c1a031b34b4147773457e3efe743b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:44 -0700 Subject: mm/vmstat: convert NUMA statistics to basic NUMA counters NUMA statistics are maintained on the zone level for hits, misses, foreign etc but nothing relies on them being perfectly accurate for functional correctness. The counters are used by userspace to get a general overview of a workloads NUMA behaviour but the page allocator incurs a high cost to maintain perfect accuracy similar to what is required for a vmstat like NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to turn off the collection of NUMA statistics like NUMA_HIT. This patch converts NUMA_HIT and friends to be NUMA events with similar accuracy to VM events. There is a possibility that slight errors will be introduced but the overall trend as seen by userspace will be similar. The counters are no longer updated from vmstat_refresh context as it is unnecessary overhead for counters that may never be read by userspace. Note that counters could be maintained at the node level to save space but it would have a user-visible impact due to /proc/zoneinfo. [lkp@intel.com: Fix misplaced closing brace for !CONFIG_NUMA] Link: https://lkml.kernel.org/r/20210512095458.30632-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 18 ++--- include/linux/mmzone.h | 13 ++-- include/linux/vmstat.h | 43 ++++++------ mm/mempolicy.c | 2 +- mm/page_alloc.c | 12 ++-- mm/vmstat.c | 173 +++++++++++++++++++------------------------------ 6 files changed, 113 insertions(+), 148 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 2c36f61d30bc..9db297431b97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -482,6 +482,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + fold_vm_numa_events(); return sysfs_emit(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -489,12 +490,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + sum_zone_numa_event_state(dev->id, NUMA_HIT), + sum_zone_numa_event_state(dev->id, NUMA_MISS), + sum_zone_numa_event_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_event_state(dev->id, NUMA_LOCAL), + sum_zone_numa_event_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL); @@ -512,10 +513,11 @@ static ssize_t node_read_vmstat(struct device *dev, sum_zone_node_page_state(nid, i)); #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + fold_vm_numa_events(); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) len += sysfs_emit_at(buf, len, "%s %lu\n", numa_stat_name(i), - sum_zone_numa_state(nid, i)); + sum_zone_numa_event_state(nid, i)); #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0d6bb737e5a2..f86018d5e362 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -135,10 +135,10 @@ enum numa_stat_item { NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ - NR_VM_NUMA_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS }; #else -#define NR_VM_NUMA_STAT_ITEMS 0 +#define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { @@ -357,7 +357,12 @@ struct per_cpu_zonestat { s8 stat_threshold; #endif #ifdef CONFIG_NUMA - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + /* + * Low priority inaccurate counters that are only folded + * on demand. Use a large type to avoid the overhead of + * folding during refresh_cpu_vm_stats. + */ + unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; @@ -623,7 +628,7 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; + atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 0c5f36504613..59748bbbba4c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -138,34 +138,27 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; +extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA -static inline void zone_numa_state_add(long x, struct zone *zone, - enum numa_stat_item item) +static inline void zone_numa_event_add(long x, struct zone *zone, + enum numa_stat_item item) { - atomic_long_add(x, &zone->vm_numa_stat[item]); - atomic_long_add(x, &vm_numa_stat[item]); + atomic_long_add(x, &zone->vm_numa_event[item]); + atomic_long_add(x, &vm_numa_event[item]); } -static inline unsigned long global_numa_state(enum numa_stat_item item) +static inline unsigned long zone_numa_event_state(struct zone *zone, + enum numa_stat_item item) { - long x = atomic_long_read(&vm_numa_stat[item]); - - return x; + return atomic_long_read(&zone->vm_numa_event[item]); } -static inline unsigned long zone_numa_state_snapshot(struct zone *zone, - enum numa_stat_item item) +static inline unsigned long +global_numa_event_state(enum numa_stat_item item) { - long x = atomic_long_read(&zone->vm_numa_stat[item]); - int cpu; - - for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; - - return x; + return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ @@ -245,18 +238,22 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } #ifdef CONFIG_NUMA -extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); +extern void __count_numa_event(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); -extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); +extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); +extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) +static inline void fold_vm_numa_events(void) +{ +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP @@ -428,7 +425,7 @@ static inline const char *numa_stat_name(enum numa_stat_item item) static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + item]; } @@ -440,7 +437,7 @@ static inline const char *lru_list_name(enum lru_list lru) static inline const char *writeback_stat_name(enum writeback_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + item]; } @@ -449,7 +446,7 @@ static inline const char *writeback_stat_name(enum writeback_stat_item item) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS + item]; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 325771bef5e2..b5d95bf1025d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2150,7 +2150,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; if (page && page_to_nid(page) == nid) { preempt_disable(); - __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89872ad5e872..4e03109bdae5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3480,12 +3480,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __inc_numa_state(z, NUMA_HIT); + __count_numa_event(z, NUMA_HIT); else { - __inc_numa_state(z, NUMA_MISS); - __inc_numa_state(preferred_zone, NUMA_FOREIGN); + __count_numa_event(z, NUMA_MISS); + __count_numa_event(preferred_zone, NUMA_FOREIGN); } - __inc_numa_state(z, local_stat); + __count_numa_event(z, local_stat); #endif } @@ -6785,8 +6785,8 @@ void __init setup_per_cpu_pageset(void) */ for_each_possible_cpu(cpu) { struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); - memset(pzstats->vm_numa_stat_diff, 0, - sizeof(pzstats->vm_numa_stat_diff)); + memset(pzstats->vm_numa_event, 0, + sizeof(pzstats->vm_numa_event)); } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index f1400ba46beb..0e27b62e487d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -31,8 +31,6 @@ #include "internal.h" -#define NUMA_STATS_THRESHOLD (U16_MAX - 2) - #ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; @@ -41,11 +39,12 @@ static void zero_zone_numa_counters(struct zone *zone) { int item, cpu; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { - atomic_long_set(&zone->vm_numa_stat[item], 0); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item] + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_event[item], 0); + for_each_online_cpu(cpu) { + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0; + } } } @@ -63,8 +62,8 @@ static void zero_global_numa_counters(void) { int item; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) - atomic_long_set(&vm_numa_stat[item], 0); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + atomic_long_set(&vm_numa_event[item], 0); } static void invalid_numa_statistics(void) @@ -161,10 +160,9 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; -atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); -EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -706,8 +704,7 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. * Returns the number of counters updated. */ -#ifdef CONFIG_NUMA -static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +static int fold_diff(int *zone_diff, int *node_diff) { int i; int changes = 0; @@ -718,12 +715,6 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) changes++; } - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (numa_diff[i]) { - atomic_long_add(numa_diff[i], &vm_numa_stat[i]); - changes++; - } - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); @@ -731,26 +722,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) } return changes; } -#else -static int fold_diff(int *zone_diff, int *node_diff) + +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) { - int i; - int changes = 0; + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (zone_diff[i]) { - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; - } + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - if (node_diff[i]) { - atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); } - return changes; + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); } -#endif /* CONFIG_NUMA */ + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif /* * Update the zone counters for the current cpu. @@ -774,9 +773,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -801,17 +797,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - int v; - - v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0); - if (v) { - - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; - __this_cpu_write(pcp->expire, 3); - } - } if (do_pagesets) { cond_resched(); @@ -859,12 +844,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } -#ifdef CONFIG_NUMA - changes += fold_diff(global_zone_diff, global_numa_diff, - global_node_diff); -#else changes += fold_diff(global_zone_diff, global_node_diff); -#endif return changes; } @@ -879,9 +859,6 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -889,7 +866,7 @@ void cpu_vm_stats_fold(int cpu) pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (pzstats->vm_stat_diff[i]) { int v; @@ -898,17 +875,17 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } - + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pzstats->vm_numa_stat_diff[i]) { - int v; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + unsigned long v; - v = pzstats->vm_numa_stat_diff[i]; - pzstats->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } @@ -928,11 +905,7 @@ void cpu_vm_stats_fold(int cpu) } } -#ifdef CONFIG_NUMA - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); -#else fold_diff(global_zone_diff, global_node_diff); -#endif } /* @@ -941,43 +914,37 @@ void cpu_vm_stats_fold(int cpu) */ void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { + unsigned long v; int i; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (pzstats->vm_stat_diff[i]) { - int v = pzstats->vm_stat_diff[i]; + v = pzstats->vm_stat_diff[i]; pzstats->vm_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_stat[i]); - atomic_long_add(v, &vm_zone_stat[i]); + zone_page_state_add(v, zone, i); } + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pzstats->vm_numa_stat_diff[i]) { - int v = pzstats->vm_numa_stat_diff[i]; - - pzstats->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - atomic_long_add(v, &vm_numa_stat[i]); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } #endif #ifdef CONFIG_NUMA -void __inc_numa_state(struct zone *zone, +/* See __count_vm_event comment on why raw_cpu_inc is used. */ +void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; - u16 __percpu *p = pzstats->vm_numa_stat_diff + item; - u16 v; - - v = __this_cpu_inc_return(*p); - if (unlikely(v > NUMA_STATS_THRESHOLD)) { - zone_numa_state_add(v, zone, item); - __this_cpu_write(*p, 0); - } + raw_cpu_inc(pzstats->vm_numa_event[item]); } /* @@ -998,19 +965,16 @@ unsigned long sum_zone_node_page_state(int node, return count; } -/* - * Determine the per node value of a numa stat item. To avoid deviation, - * the per cpu stat number in vm_numa_stat_diff[] is also included. - */ -unsigned long sum_zone_numa_state(int node, +/* Determine the per node value of a numa stat item. */ +unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; - int i; unsigned long count = 0; + int i; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state_snapshot(zones + i, item); + count += zone_numa_event_state(zones + i, item); return count; } @@ -1689,9 +1653,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", numa_stat_name(i), - zone_numa_state_snapshot(zone, i)); + zone_numa_event_state(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1745,7 +1709,7 @@ static const struct seq_operations zoneinfo_op = { }; #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ - NR_VM_NUMA_STAT_ITEMS + \ + NR_VM_NUMA_EVENT_ITEMS + \ NR_VM_NODE_STAT_ITEMS + \ NR_VM_WRITEBACK_STAT_ITEMS + \ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ @@ -1760,6 +1724,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) return NULL; BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + fold_vm_numa_events(); v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; if (!v) @@ -1769,9 +1734,9 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_ZONE_STAT_ITEMS; #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - v[i] = global_numa_state(i); - v += NR_VM_NUMA_STAT_ITEMS; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + v[i] = global_numa_event_state(i); + v += NR_VM_NUMA_EVENT_ITEMS; #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { @@ -1941,11 +1906,7 @@ static bool need_update(int cpu) if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * sizeof(pzstats->vm_stat_diff[0]))) return true; -#ifdef CONFIG_NUMA - if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * - sizeof(pzstats->vm_numa_stat_diff[0]))) - return true; -#endif + if (last_pgdat == zone->zone_pgdat) continue; last_pgdat = zone->zone_pgdat; -- cgit v1.2.3 From 3e23060b2d0b7eebf37b3b6043ea68da0ebc0646 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:50 -0700 Subject: mm/page_alloc: batch the accounting updates in the bulk allocator Now that the zone_statistics are simple counters that do not require special protection, the bulk allocator accounting updates can be batch updated without adding too much complexity with protected RMW updates or using xchg. Link: https://lkml.kernel.org/r/20210512095458.30632-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 8 ++++++++ mm/page_alloc.c | 30 +++++++++++++----------------- 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fe32a2210e73..d6a6cf53b127 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -247,6 +247,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item item) raw_cpu_inc(pzstats->vm_numa_event[item]); } +static inline void +__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) +{ + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + + raw_cpu_add(pzstats->vm_numa_event[item], delta); +} + extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e03109bdae5..6bb9b87cf7d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3467,7 +3467,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + long nr_account) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -3480,12 +3481,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __count_numa_event(z, NUMA_HIT); + __count_numa_events(z, NUMA_HIT, nr_account); else { - __count_numa_event(z, NUMA_MISS); - __count_numa_event(preferred_zone, NUMA_FOREIGN); + __count_numa_events(z, NUMA_MISS, nr_account); + __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); } - __count_numa_event(z, local_stat); + __count_numa_events(z, local_stat, nr_account); #endif } @@ -3531,7 +3532,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); } local_unlock_irqrestore(&pagesets.lock, flags); return page; @@ -3592,7 +3593,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); local_irq_restore(flags); out: @@ -5077,7 +5078,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int nr_populated = 0; + int nr_populated = 0, nr_account = 0; if (unlikely(nr_pages <= 0)) return 0; @@ -5154,15 +5155,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed_irq; break; } - - /* - * Ideally this would be batched but the best way to do - * that cheaply is to first convert zone_statistics to - * be inaccurate per-cpu counter like vm_events to avoid - * a RMW cycle then do the accounting with IRQs enabled. - */ - __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); - zone_statistics(ac.preferred_zoneref->zone, zone); + nr_account++; prep_new_page(page, 0, gfp, 0); if (page_list) @@ -5172,6 +5165,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); + zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; -- cgit v1.2.3 From 43c95bcc51e4e7f3e3cbce01515fe429a4cf12a7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:54 -0700 Subject: mm/page_alloc: reduce duration that IRQs are disabled for VM counters IRQs are left disabled for the zone and node VM event counters. This is unnecessary as the affected counters are allowed to race for preemmption and IRQs. This patch reduces the scope of IRQs being disabled via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One __mod_zone_freepage_state is still called with IRQs disabled. While this could be moved out, it's not free on all architectures as some require IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels. Link: https://lkml.kernel.org/r/20210512095458.30632-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6bb9b87cf7d5..161bcda61520 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3530,11 +3530,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); } - local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -3586,15 +3586,15 @@ struct page *rmqueue(struct zone *preferred_zone, if (!page) page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); - local_irq_restore(flags); out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3607,7 +3607,7 @@ out: return page; failed: - local_irq_restore(flags); + spin_unlock_irqrestore(&zone->lock, flags); return NULL; } @@ -5165,11 +5165,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + local_unlock_irqrestore(&pagesets.lock, flags); + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); - local_unlock_irqrestore(&pagesets.lock, flags); - return nr_populated; failed_irq: -- cgit v1.2.3 From 56f0e661ea8c0178e80048df7166653a51ef2c3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:57 -0700 Subject: mm/page_alloc: explicitly acquire the zone lock in __free_pages_ok __free_pages_ok() disables IRQs before calling a common helper free_one_page() that acquires the zone lock. This is not safe according to Documentation/locking/locktypes.rst and in this context, IRQ disabling is not protecting a per_cpu_pages structure either or a local_lock would be used. This patch explicitly acquires the lock with spin_lock_irqsave instead of relying on a helper. This removes the last instance of local_irq_save() in page_alloc.c. Link: https://lkml.kernel.org/r/20210512095458.30632-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 161bcda61520..f1a51c163e75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1590,21 +1590,21 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); - /* - * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock - * and protect vmstat updates. - */ - local_irq_save(flags); + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype, - fpi_flags); - local_irq_restore(flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); } void __free_pages_core(struct page *page, unsigned int order) -- cgit v1.2.3 From df1acc856923c0a65c28b588585449106c316b71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:00 -0700 Subject: mm/page_alloc: avoid conflating IRQs disabled with zone->lock Historically when freeing pages, free_one_page() assumed that callers had IRQs disabled and the zone->lock could be acquired with spin_lock(). This confuses the scope of what local_lock_irq is protecting and what zone->lock is protecting in free_unref_page_list in particular. This patch uses spin_lock_irqsave() for the zone->lock in free_one_page() instead of relying on callers to have disabled IRQs. free_unref_page_commit() is changed to only deal with PCP pages protected by the local lock. free_unref_page_list() then first frees isolated pages to the buddy lists with free_one_page() and frees the rest of the pages to the PCP via free_unref_page_commit(). The end result is that free_one_page() is no longer depending on side-effects of local_lock to be correct. Note that this may incur a performance penalty while memory hot-remove is running but that is not a common operation. [lkp@intel.com: Ensure CMA pages get addded to correct pcp list] Link: https://lkml.kernel.org/r/20210512095458.30632-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 75 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1a51c163e75..dd367e5df8cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,13 +1501,15 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype, fpi_t fpi_flags) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -3285,31 +3287,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_unref_page_commit(struct page *page, unsigned long pfn) +static void free_unref_page_commit(struct page *page, unsigned long pfn, + int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - int migratetype; - migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); - - /* - * We only track unmovable, reclaimable and movable on pcp lists. - * Free ISOLATE pages back to the allocator because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free - * excessively into the page allocator - */ - if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype, - FPI_NONE); - return; - } - migratetype = MIGRATE_MOVABLE; - } - pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; @@ -3324,12 +3308,29 @@ void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); + int migratetype; if (!free_unref_page_prepare(page, pfn)) return; + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Place ISOLATE pages on the isolated list because they are being + * offlined but treat HIGHATOMIC as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + return; + } + migratetype = MIGRATE_MOVABLE; + } + local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3341,22 +3342,44 @@ void free_unref_page_list(struct list_head *list) struct page *page, *next; unsigned long flags, pfn; int batch_count = 0; + int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn)) list_del(&page->lru); + + /* + * Free isolated pages directly to the allocator, see + * comment in free_unref_page. + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, + migratetype, FPI_NONE); + continue; + } + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. + */ + set_pcppage_migratetype(page, MIGRATE_MOVABLE); + } + set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_private(page); - + pfn = page_private(page); set_page_private(page, 0); + migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); /* * Guard against excessive IRQ disabled times when we get -- cgit v1.2.3 From 902499937e3a82156dcb5069b6df27640480e204 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:03 -0700 Subject: mm/page_alloc: update PGFREE outside the zone lock in __free_pages_ok VM events do not need explicit protection by disabling IRQs so update the counter with IRQs enabled in __free_pages_ok. Link: https://lkml.kernel.org/r/20210512095458.30632-10-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd367e5df8cb..37ce0c2f3bae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1600,13 +1600,14 @@ static void __free_pages_ok(struct page *page, unsigned int order, migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); - __count_vm_events(PGFREE, 1 << order); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); } void __free_pages_core(struct page *page, unsigned int order) -- cgit v1.2.3 From 151e084af4946344fe0d021f4110b69edaac1e8d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 28 Jun 2021 19:42:06 -0700 Subject: mm: page_alloc: dump migrate-failed pages only at -EBUSY alloc_contig_dump_pages() aims for helping debugging page migration failure by elevated page refcount compared to expected_count. (for the detail, please look at migrate_page_move_mapping) However, -ENOMEM is just the case that system is under memory pressure state, not relevant with page refcount at all. Thus, the dumping page list is not helpful for the debugging point of view. Link: https://lkml.kernel.org/r/YKa2Wyo9xqIErpfa@google.com Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Suren Baghdasaryan Cc: John Dias Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ce0c2f3bae..941a75b9fb5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8800,7 +8800,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, lru_cache_enable(); if (ret < 0) { - alloc_contig_dump_pages(&cc->migratepages); + if (ret == -EBUSY) + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } -- cgit v1.2.3 From bbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:09 -0700 Subject: mm/page_alloc: delete vm.percpu_pagelist_fraction Patch series "Calculate pcp->high based on zone sizes and active CPUs", v2. The per-cpu page allocator (PCP) is meant to reduce contention on the zone lock but the sizing of batch and high is archaic and neither takes the zone size into account or the number of CPUs local to a zone. With larger zones and more CPUs per node, the contention is getting worse. Furthermore, the fact that vm.percpu_pagelist_fraction adjusts both batch and high values means that the sysctl can reduce zone lock contention but also increase allocation latencies. This series disassociates pcp->high from pcp->batch and then scales pcp->high based on the size of the local zone with limited impact to reclaim and accounting for active CPUs but leaves pcp->batch static. It also adapts the number of pages that can be on the pcp list based on recent freeing patterns. The motivation is partially to adjust to larger memory sizes but is also driven by the fact that large batches of page freeing via release_pages() often shows zone contention as a major part of the problem. Another is a bug report based on an older kernel where a multi-terabyte process can takes several minutes to exit. A workaround was to use vm.percpu_pagelist_fraction to increase the pcp->high value but testing indicated that a production workload could not use the same values because of an increase in allocation latencies. Unfortunately, I cannot reproduce this test case myself as the multi-terabyte machines are in active use but it should alleviate the problem. The series aims to address both and partially acts as a pre-requisite. pcp only works with order-0 which is useless for SLUB (when using high orders) and THP (unconditionally). To store high-order pages on PCP, the pcp->high values need to be increased first. This patch (of 6): The vm.percpu_pagelist_fraction is used to increase the batch and high limits for the per-cpu page allocator (PCP). The intent behind the sysctl is to reduce zone lock acquisition when allocating/freeing pages but it has a problem. While it can decrease contention, it can also increase latency on the allocation side due to unreasonably large batch sizes. This leads to games where an administrator adjusts percpu_pagelist_fraction on the fly to work around contention and allocation latency problems. This series aims to alleviate the problems with zone lock contention while avoiding the allocation-side latency problems. For the purposes of review, it's easier to remove this sysctl now and reintroduce a similar sysctl later in the series that deals only with pcp->high. Link: https://lkml.kernel.org/r/20210525080119.5455-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 19 ------------ include/linux/mmzone.h | 3 -- kernel/sysctl.c | 8 ----- mm/page_alloc.c | 55 +++------------------------------ 4 files changed, 4 insertions(+), 81 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 586cd4b86428..2fcafccb53a8 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -64,7 +64,6 @@ Currently, these files are in /proc/sys/vm: - overcommit_ratio - page-cluster - panic_on_oom -- percpu_pagelist_fraction - stat_interval - stat_refresh - numa_stat @@ -790,24 +789,6 @@ panic_on_oom=2+kdump gives you very strong tool to investigate why oom happens. You can get snapshot. -percpu_pagelist_fraction -======================== - -This is the fraction of pages at most (high mark pcp->high) in each zone that -are allocated for each per cpu page list. The min value for this is 8. It -means that we don't allow more than 1/8th of pages in each zone to be -allocated in any single per_cpu_pagelist. This entry only changes the value -of hot per cpu pagelists. User can specify a number like 100 to allocate -1/100th of each zone to each per cpu page list. - -The batch value of each per cpu pagelist is also updated as a result. It is -set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) - -The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. If the user writes '0' to this -sysctl, it will revert to this default behavior. - - stat_interval ============= diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f86018d5e362..7937a1d1d166 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1027,15 +1027,12 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d4a78e08f6d8..51213c33171e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2908,14 +2908,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941a75b9fb5a..5abf2c1d4c58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,7 +120,6 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); -#define MIN_PERCPU_PAGELIST_FRACTION (8) struct pagesets { local_lock_t lock; @@ -193,7 +192,6 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6735,22 +6733,15 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h /* * Calculate and set new high and batch values for all per-cpu pagesets of a - * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. + * zone based on the zone's size. */ static void zone_set_pageset_high_and_batch(struct zone *zone) { unsigned long new_high, new_batch; - if (percpu_pagelist_fraction) { - new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; - new_batch = max(1UL, new_high / 4); - if ((new_high / 4) > (PAGE_SHIFT * 8)) - new_batch = PAGE_SHIFT * 8; - } else { - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); - } + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8413,44 +8404,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -/* - * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu - * pagelist can have before it gets flushed back to buddy allocator. - */ -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct zone *zone; - int old_percpu_pagelist_fraction; - int ret; - - mutex_lock(&pcp_batch_high_lock); - old_percpu_pagelist_fraction = percpu_pagelist_fraction; - - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || ret < 0) - goto out; - - /* Sanity checking to avoid pcp imbalance */ - if (percpu_pagelist_fraction && - percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { - percpu_pagelist_fraction = old_percpu_pagelist_fraction; - ret = -EINVAL; - goto out; - } - - /* No change? */ - if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) - goto out; - - for_each_populated_zone(zone) - zone_set_pageset_high_and_batch(zone); -out: - mutex_unlock(&pcp_batch_high_lock); - return ret; -} - #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit v1.2.3 From b92ca18e8ca596f4f3d80c1fe833bc57a1b2458c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:12 -0700 Subject: mm/page_alloc: disassociate the pcp->high from pcp->batch The pcp high watermark is based on the batch size but there is no relationship between them other than it is convenient to use early in boot. This patch takes the first step and bases pcp->high on the zone low watermark split across the number of CPUs local to a zone while the batch size remains the same to avoid increasing allocation latencies. The intent behind the default pcp->high is "set the number of PCP pages such that if they are all full that background reclaim is not started prematurely". Note that in this patch the pcp->high values are adjusted after memory hotplug events, min_free_kbytes adjustments and watermark scale factor adjustments but not CPU hotplug events which is handled later in the series. On a test KVM instance; Before grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 378 batch: 63 After grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 [mgorman@techsingularity.net: fix __setup_per_zone_wmarks for parallel memory hotplug] Link: https://lkml.kernel.org/r/20210528105925.GN30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- mm/page_alloc.c | 62 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70620d0dd923..974a565797d8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -961,7 +961,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); - zone_pcp_update(zone); /* Basic onlining is complete, allow allocation of onlined pages. */ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); @@ -974,6 +973,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z */ shuffle_zone(zone); + /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); kswapd_run(nid); @@ -1829,13 +1829,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); adjust_present_page_count(zone, -nr_pages); + /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); - } else - zone_pcp_update(zone); + } node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5abf2c1d4c58..19ec81d403a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2174,14 +2174,6 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); - /* - * The number of managed pages has changed due to the initialisation - * so the pcpu batch and high limits needs to be updated or the limits - * will be artificially small. - */ - for_each_populated_zone(zone) - zone_pcp_update(zone); - /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. @@ -6633,13 +6625,12 @@ static int zone_batchsize(struct zone *zone) int batch; /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. + * The number of pages to batch allocate is either ~0.1% + * of the zone or 1MB, whichever is smaller. The batch + * size is striking a balance between allocation latency + * and zone lock contention. */ - batch = zone_managed_pages(zone) / 1024; - /* But no more than a meg. */ - if (batch * PAGE_SIZE > 1024 * 1024) - batch = (1024 * 1024) / PAGE_SIZE; + batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6676,6 +6667,34 @@ static int zone_batchsize(struct zone *zone) #endif } +static int zone_highsize(struct zone *zone, int batch) +{ +#ifdef CONFIG_MMU + int high; + int nr_local_cpus; + + /* + * The high value of the pcp is based on the zone low watermark + * so that if they are full then background reclaim will not be + * started prematurely. The value is split across all online CPUs + * local to the zone. Note that early in boot that CPUs may not be + * online yet. + */ + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + high = low_wmark_pages(zone) / nr_local_cpus; + + /* + * Ensure high is at least batch*4. The multiple is based on the + * historical relationship between high and batch. + */ + high = max(high, batch << 2); + + return high; +#else + return 0; +#endif +} + /* * pcp->high and pcp->batch values are related and generally batch is lower * than high. They are also related to pcp->count such that count is lower @@ -6737,11 +6756,10 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h */ static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long new_high, new_batch; + int new_high, new_batch; - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); + new_batch = max(1, zone_batchsize(zone)); + new_high = zone_highsize(zone, new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8222,11 +8240,19 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { + struct zone *zone; static DEFINE_SPINLOCK(lock); spin_lock(&lock); __setup_per_zone_wmarks(); spin_unlock(&lock); + + /* + * The watermark size have changed so update the pcpu batch + * and high limits or the limits may be inappropriate. + */ + for_each_zone(zone) + zone_pcp_update(zone); } /* -- cgit v1.2.3 From 04f8cfeaed0849e702278378bce3867577ca45fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:15 -0700 Subject: mm/page_alloc: adjust pcp->high after CPU hotplug events The PCP high watermark is based on the number of online CPUs so the watermarks must be adjusted during CPU hotplug. At the time of hot-remove, the number of online CPUs is already adjusted but during hot-add, a delta needs to be applied to update PCP to the correct value. After this patch is applied, the high watermarks are adjusted correctly. # grep high: /proc/zoneinfo | tail -1 high: 649 # echo 0 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 664 # echo 1 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 649 Link: https://lkml.kernel.org/r/20210525080119.5455-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 2 +- mm/internal.h | 2 +- mm/page_alloc.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 29 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4a62b3980642..47e13582d9fc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -54,7 +54,7 @@ enum cpuhp_state { CPUHP_MM_MEMCQ_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, - CPUHP_PAGE_ALLOC_DEAD, + CPUHP_PAGE_ALLOC, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_IOVA_DEAD, diff --git a/mm/internal.h b/mm/internal.h index 2946dfa0f245..18e5fb4d225f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -206,7 +206,7 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19ec81d403a0..8d196a803820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6667,7 +6667,7 @@ static int zone_batchsize(struct zone *zone) #endif } -static int zone_highsize(struct zone *zone, int batch) +static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; @@ -6678,9 +6678,10 @@ static int zone_highsize(struct zone *zone, int batch) * so that if they are full then background reclaim will not be * started prematurely. The value is split across all online CPUs * local to the zone. Note that early in boot that CPUs may not be - * online yet. + * online yet and that during CPU hotplug that the cpumask is not + * yet updated when a CPU is being onlined. */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; high = low_wmark_pages(zone) / nr_local_cpus; /* @@ -6754,12 +6755,12 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h * Calculate and set new high and batch values for all per-cpu pagesets of a * zone based on the zone's size. */ -static void zone_set_pageset_high_and_batch(struct zone *zone) +static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { int new_high, new_batch; new_batch = max(1, zone_batchsize(zone)); - new_high = zone_highsize(zone, new_batch); + new_high = zone_highsize(zone, new_batch, cpu_online); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -6789,7 +6790,7 @@ void __meminit setup_zone_pageset(struct zone *zone) per_cpu_pages_init(pcp, pzstats); } - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, 0); } /* @@ -8044,6 +8045,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) static int page_alloc_cpu_dead(unsigned int cpu) { + struct zone *zone; lru_add_drain_cpu(cpu); drain_pages(cpu); @@ -8064,6 +8066,19 @@ static int page_alloc_cpu_dead(unsigned int cpu) * race with what we are doing. */ cpu_vm_stats_fold(cpu); + + for_each_populated_zone(zone) + zone_pcp_update(zone, 0); + + return 0; +} + +static int page_alloc_cpu_online(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zone_pcp_update(zone, 1); return 0; } @@ -8089,8 +8104,9 @@ void __init page_alloc_init(void) hashdist = 0; #endif - ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, - "mm/page_alloc:dead", NULL, + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, + "mm/page_alloc:pcp", + page_alloc_cpu_online, page_alloc_cpu_dead); WARN_ON(ret < 0); } @@ -8252,7 +8268,7 @@ void setup_per_zone_wmarks(void) * and high limits or the limits may be inappropriate. */ for_each_zone(zone) - zone_pcp_update(zone); + zone_pcp_update(zone, 0); } /* @@ -9053,10 +9069,10 @@ EXPORT_SYMBOL(free_contig_range); * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. */ -void __meminit zone_pcp_update(struct zone *zone) +void zone_pcp_update(struct zone *zone, int cpu_online) { mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, cpu_online); mutex_unlock(&pcp_batch_high_lock); } -- cgit v1.2.3 From 3b12e7e97938424de2bb1b95ba0bd6a49bad39f9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:18 -0700 Subject: mm/page_alloc: scale the number of pages that are batch freed When a task is freeing a large number of order-0 pages, it may acquire the zone->lock multiple times freeing pages in batches. This may unnecessarily contend on the zone lock when freeing very large number of pages. This patch adapts the size of the batch based on the recent pattern to scale the batch size for subsequent frees. As the machines I used were not large enough to test this are not large enough to illustrate a problem, a debugging patch shows patterns like the following (slightly editted for clarity) Baseline vanilla kernel time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 With patches time-unmap-7724 [...] free_pcppages_bulk: free 126 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 252 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 504 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 751 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 751 count 814 high 814 Link: https://lkml.kernel.org/r/20210525080119.5455-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7937a1d1d166..0a86b2890a16 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -343,8 +343,9 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ + short free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA - int expire; /* When 0, remote pagesets are drained */ + short expire; /* When 0, remote pagesets are drained */ #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d196a803820..e1d1825a2611 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3278,18 +3278,47 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +{ + int min_nr_free, max_nr_free; + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* Leave at least pcp->batch pages on the list */ + min_nr_free = batch; + max_nr_free = high - batch; + + /* + * Double the number of pages freed each time there is subsequent + * freeing of pages without any allocation. + */ + batch <<= pcp->free_factor; + if (batch < max_nr_free) + pcp->free_factor++; + batch = clamp(batch, min_nr_free, max_nr_free); + + return batch; +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; + int high; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= READ_ONCE(pcp->high)) - free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); + high = READ_ONCE(pcp->high); + if (pcp->count >= high) { + int batch = READ_ONCE(pcp->batch); + + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + } } /* @@ -3541,7 +3570,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_lock_irqsave(&pagesets.lock, flags); + + /* + * On allocation, reduce the number of pages that are batch freed. + * See nr_pcp_free() where free_factor is increased for subsequent + * frees. + */ pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp->free_factor >>= 1; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); @@ -6737,6 +6773,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta */ pcp->high = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; + pcp->free_factor = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, -- cgit v1.2.3 From c49c2c47dab6b8d45022b3fabf0642a0e62e3109 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:21 -0700 Subject: mm/page_alloc: limit the number of pages on PCP lists when reclaim is active When kswapd is active then direct reclaim is potentially active. In either case, it is possible that a zone would be balanced if pages were not trapped on PCP lists. Instead of draining remote pages, simply limit the size of the PCP lists while kswapd is active. Link: https://lkml.kernel.org/r/20210525080119.5455-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 19 ++++++++++++++++++- mm/vmscan.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0a86b2890a16..b2f40d64bc4b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -647,6 +647,7 @@ enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ + ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1d1825a2611..adf35ccfd8e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3302,6 +3302,23 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +{ + int high = READ_ONCE(pcp->high); + + if (unlikely(!high)) + return 0; + + if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + return high; + + /* + * If reclaim is active, limit the number of pages that can be + * stored on pcp lists + */ + return min(READ_ONCE(pcp->batch) << 2, high); +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { @@ -3313,7 +3330,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - high = READ_ONCE(pcp->high); + high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); diff --git a/mm/vmscan.c b/mm/vmscan.c index f96d62159720..d7c3cb8688dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3722,6 +3722,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, return sc->nr_scanned >= sc->nr_to_reclaim; } +/* Page allocator PCP high watermark is lowered if reclaim is active. */ +static inline void +update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) +{ + int i; + struct zone *zone; + + for (i = 0; i <= highest_zoneidx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + if (active) + set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + else + clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + } +} + +static inline void +set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, true); +} + +static inline void +clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, false); +} + /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is @@ -3774,6 +3806,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) boosted = nr_boost_reclaim; restart: + set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; @@ -3907,6 +3940,8 @@ restart: pgdat->kswapd_failures++; out: + clear_reclaim_active(pgdat, highest_zoneidx); + /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; -- cgit v1.2.3 From 74f44822097c665041010994502b5971d6cd9f04 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:24 -0700 Subject: mm/page_alloc: introduce vm.percpu_pagelist_high_fraction This introduces a new sysctl vm.percpu_pagelist_high_fraction. It is similar to the old vm.percpu_pagelist_fraction. The old sysctl increased both pcp->batch and pcp->high with the higher pcp->high potentially reducing zone->lock contention. However, the higher pcp->batch value also potentially increased allocation latency while the PCP was refilled. This sysctl only adjusts pcp->high so that zone->lock contention is potentially reduced but allocation latency during a PCP refill remains the same. # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=8 # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 35071 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=64 high: 4383 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=0 high: 649 batch: 63 [mgorman@techsingularity.net: fix documentation] Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 21 ++++++++++ include/linux/mmzone.h | 3 ++ kernel/sysctl.c | 8 ++++ mm/page_alloc.c | 69 +++++++++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 2fcafccb53a8..2da25735a629 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm: - overcommit_ratio - page-cluster - panic_on_oom +- percpu_pagelist_high_fraction - stat_interval - stat_refresh - numa_stat @@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate why oom happens. You can get snapshot. +percpu_pagelist_high_fraction +============================= + +This is the fraction of pages in each zone that are can be stored to +per-cpu page lists. It is an upper boundary that is divided depending +on the number of online CPUs. The min value for this is 8 which means +that we do not allow more than 1/8th of pages in each zone to be stored +on per-cpu page lists. This entry only changes the value of hot per-cpu +page lists. A user can specify a number like 100 to allocate 1/100th of +each zone between per-cpu lists. + +The batch value of each per-cpu page list remains the same regardless of +the value of the high fraction so allocation latencies are unaffected. + +The initial value is zero. Kernel uses this value to set the high pcp->high +mark based on the low watermark for the zone and the number of local +online CPUs. If the user writes '0' to this sysctl, it will revert to +this default behavior. + + stat_interval ============= diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2f40d64bc4b..7d206ca850c7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +extern int percpu_pagelist_high_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 51213c33171e..69d925f1e5da 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index adf35ccfd8e5..cfc4071310fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,6 +120,7 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) struct pagesets { local_lock_t lock; @@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; +int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) #ifdef CONFIG_MMU int high; int nr_local_cpus; + unsigned long total_pages; + + if (!percpu_pagelist_high_fraction) { + /* + * By default, the high value of the pcp is based on the zone + * low watermark so that if they are full then background + * reclaim will not be started prematurely. + */ + total_pages = low_wmark_pages(zone); + } else { + /* + * If percpu_pagelist_high_fraction is configured, the high + * value is based on a fraction of the managed pages in the + * zone. + */ + total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + } /* - * The high value of the pcp is based on the zone low watermark - * so that if they are full then background reclaim will not be - * started prematurely. The value is split across all online CPUs - * local to the zone. Note that early in boot that CPUs may not be - * online yet and that during CPU hotplug that the cpumask is not - * yet updated when a CPU is being onlined. + * Split the high value across all online CPUs local to the zone. Note + * that early in boot that CPUs may not be online yet and that during + * CPU hotplug that the cpumask is not yet updated when a CPU is being + * onlined. */ nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = low_wmark_pages(zone) / nr_local_cpus; + high = total_pages / nr_local_cpus; /* * Ensure high is at least batch*4. The multiple is based on the @@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. + */ +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int old_percpu_pagelist_high_fraction; + int ret; + + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_high_fraction && + percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { + percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) + goto out; + + for_each_populated_zone(zone) + zone_set_pageset_high_and_batch(zone, 0); +out: + mutex_unlock(&pcp_batch_high_lock); + return ret; +} + #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit v1.2.3 From e47aa90568de326625b19d7bc872f8d70b0820b0 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Mon, 28 Jun 2021 19:42:30 -0700 Subject: mm/page_alloc: improve memmap_pages dbg msg Make debug message more accurate. Link: https://lkml.kernel.org/r/20210531091908.1738465-6-aisheng.dong@nxp.com Signed-off-by: Dong Aisheng Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc4071310fb..2a306c34fda7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7383,7 +7383,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) pr_debug(" %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } -- cgit v1.2.3 From f7ec104458e00d27a190348ac3a513f3df3699a4 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 28 Jun 2021 19:42:33 -0700 Subject: mm/page_alloc: fix counting of managed_pages commit f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") clears out zone->lowmem_reserve[] if zone is empty. But when zone is not empty and sysctl_lowmem_reserve_ratio[i] is set to zero, zone_managed_pages(zone) is not counted in the managed_pages either. This is inconsistent with the description of lowmem_reserve, so fix it. Link: https://lkml.kernel.org/r/20210527125707.3760259-1-liushixin2@huawei.com Fixes: f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") Signed-off-by: Liu Shixin Reported-by: yangerkun Reviewed-by: Baoquan He Acked-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a306c34fda7..fc151f6a7dbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8240,14 +8240,14 @@ static void setup_per_zone_lowmem_reserve(void) unsigned long managed_pages = 0; for (j = i + 1; j < MAX_NR_ZONES; j++) { - if (clear) { - zone->lowmem_reserve[j] = 0; - } else { - struct zone *upper_zone = &pgdat->node_zones[j]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); - managed_pages += zone_managed_pages(upper_zone); + if (clear) + zone->lowmem_reserve[j] = 0; + else zone->lowmem_reserve[j] = managed_pages / ratio; - } } } } -- cgit v1.2.3 From 21d02f8f8464e27434f477c73431075197a9f72f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:36 -0700 Subject: mm/page_alloc: move free_the_page Patch series "Allow high order pages to be stored on PCP", v2. The per-cpu page allocator (PCP) only handles order-0 pages. With the series "Use local_lock for pcp protection and reduce stat overhead" and "Calculate pcp->high based on zone sizes and active CPUs", it's now feasible to store high-order pages on PCP lists. This small series allows PCP to store "cheap" orders where cheap is determined by PAGE_ALLOC_COSTLY_ORDER and THP-sized allocations. This patch (of 2): In the next page, free_compount_page is going to use the common helper free_the_page. This patch moves the definition to ease review. No functional change. Link: https://lkml.kernel.org/r/20210603142220.10851-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210603142220.10851-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc151f6a7dbd..58f7a321598f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,6 +687,14 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (order == 0) /* Via pcp? */ + free_unref_page(page); + else + __free_pages_ok(page, order, FPI_NONE); +} + /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -5349,14 +5357,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (order == 0) /* Via pcp? */ - free_unref_page(page); - else - __free_pages_ok(page, order, FPI_NONE); -} - /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). -- cgit v1.2.3 From bb1c50d3967f69f413b333713c2718d48d1ab7ea Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:52 -0700 Subject: mm: remove CONFIG_DISCONTIGMEM There are no architectures that support DISCONTIGMEM left. Remove the configuration option and the dead code it was guarding in the generic memory management code. Link: https://lkml.kernel.org/r/20210608091316.3622-6-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/memory_model.h | 37 ++++--------------------------------- include/linux/mmzone.h | 8 +++++--- mm/Kconfig | 25 +++---------------------- mm/page_alloc.c | 13 ------------- 4 files changed, 12 insertions(+), 71 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 7637fb46ba4f..a2c8ed60233a 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -6,47 +6,18 @@ #ifndef __ASSEMBLY__ +/* + * supports 3 memory models. + */ #if defined(CONFIG_FLATMEM) #ifndef ARCH_PFN_OFFSET #define ARCH_PFN_OFFSET (0UL) #endif -#elif defined(CONFIG_DISCONTIGMEM) - -#ifndef arch_pfn_to_nid -#define arch_pfn_to_nid(pfn) pfn_to_nid(pfn) -#endif - -#ifndef arch_local_page_offset -#define arch_local_page_offset(pfn, nid) \ - ((pfn) - NODE_DATA(nid)->node_start_pfn) -#endif - -#endif /* CONFIG_DISCONTIGMEM */ - -/* - * supports 3 memory models. - */ -#if defined(CONFIG_FLATMEM) - #define __pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET)) #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) -#elif defined(CONFIG_DISCONTIGMEM) - -#define __pfn_to_page(pfn) \ -({ unsigned long __pfn = (pfn); \ - unsigned long __nid = arch_pfn_to_nid(__pfn); \ - NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ -}) - -#define __page_to_pfn(pg) \ -({ const struct page *__pg = (pg); \ - struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg)); \ - (unsigned long)(__pg - __pgdat->node_mem_map) + \ - __pgdat->node_start_pfn; \ -}) #elif defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -70,7 +41,7 @@ struct mem_section *__sec = __pfn_to_section(__pfn); \ __section_mem_map_addr(__sec) + __pfn; \ }) -#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ +#endif /* CONFIG_FLATMEM/SPARSEMEM */ /* * Convert a physical address to a Page Frame Number and back diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e62e8ef68b5..6f9829562af2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -749,10 +749,12 @@ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +/* + * The array of struct pages for flatmem. + * It must be declared for SPARSEMEM as well because there are configurations + * that rely on that. + */ extern struct page *mem_map; -#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { diff --git a/mm/Kconfig b/mm/Kconfig index 02d44e3420f5..218b96ccc84a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -19,7 +19,7 @@ choice config FLATMEM_MANUAL bool "Flat Memory" - depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE + depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE help This option is best suited for non-NUMA systems with flat address space. The FLATMEM is the most efficient @@ -32,21 +32,6 @@ config FLATMEM_MANUAL If unsure, choose this option (Flat Memory) over any other. -config DISCONTIGMEM_MANUAL - bool "Discontiguous Memory" - depends on ARCH_DISCONTIGMEM_ENABLE - help - This option provides enhanced support for discontiguous - memory systems, over FLATMEM. These systems have holes - in their physical address spaces, and this option provides - more efficient handling of these holes. - - Although "Discontiguous Memory" is still used by several - architectures, it is considered deprecated in favor of - "Sparse Memory". - - If unsure, choose "Sparse Memory" over this option. - config SPARSEMEM_MANUAL bool "Sparse Memory" depends on ARCH_SPARSEMEM_ENABLE @@ -62,17 +47,13 @@ config SPARSEMEM_MANUAL endchoice -config DISCONTIGMEM - def_bool y - depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL - config SPARSEMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y - depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + depends on !SPARSEMEM || FLATMEM_MANUAL config FLAT_NODE_MEM_MAP def_bool y @@ -85,7 +66,7 @@ config FLAT_NODE_MEM_MAP # config NEED_MULTIPLE_NODES def_bool y - depends on DISCONTIGMEM || NUMA + depends on NUMA # # SPARSEMEM_EXTREME (which is the default) does some bootmem diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58f7a321598f..8926f3fd3bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -349,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -#ifdef CONFIG_DISCONTIGMEM -/* - * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges - * are not on separate NUMA nodes. Functionally this works but with - * watermark_boost_factor, it can reclaim prematurely as the ranges can be - * quite small. By default, do not boost watermarks on discontigmem as in - * many cases very high-order allocations like THP are likely to be - * unsupported and the premature reclaim offsets the advantage of long-term - * fragmentation avoidance. - */ -int watermark_boost_factor __read_mostly; -#else int watermark_boost_factor __read_mostly = 15000; -#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; -- cgit v1.2.3 From a9ee6cf5c60ed1070e786e53665f9b2f23f2bd11 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:01 -0700 Subject: mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA After removal of DISCINTIGMEM the NEED_MULTIPLE_NODES and NUMA configuration options are equivalent. Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead. Done with $ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \ $(git grep -wl CONFIG_NEED_MULTIPLE_NODES) $ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \ $(git grep -wl NEED_MULTIPLE_NODES) with manual tweaks afterwards. [rppt@linux.ibm.com: fix arm boot crash] Link: https://lkml.kernel.org/r/YMj9vHhHOiCVN4BF@linux.ibm.com Link: https://lkml.kernel.org/r/20210608091316.3622-9-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/ia64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/mips/include/asm/mmzone.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/mips/mm/init.c | 4 ++-- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/mmzone.h | 4 ++-- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kexec/core.c | 4 ++-- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/include/asm/mmzone.h | 4 ++-- arch/sh/kernel/topology.c | 2 +- arch/sh/mm/Kconfig | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/Kconfig | 2 +- arch/sparc/include/asm/mmzone.h | 4 ++-- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/mm/init_64.c | 12 ++++++------ arch/x86/Kconfig | 2 +- arch/x86/kernel/setup_percpu.c | 6 +++--- arch/x86/mm/init_32.c | 4 ++-- include/asm-generic/topology.h | 2 +- include/linux/memblock.h | 6 +++--- include/linux/mm.h | 4 ++-- include/linux/mmzone.h | 6 +++--- kernel/crash_core.c | 2 +- mm/Kconfig | 9 --------- mm/memblock.c | 8 ++++---- mm/memory.c | 3 +-- mm/page_alloc.c | 6 +++--- mm/sparse.c | 2 +- 36 files changed, 59 insertions(+), 69 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..d01a1545ab8f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1035,7 +1035,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 279252e3e0f7..da22a35e6f03 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -302,7 +302,7 @@ config NODES_SHIFT int "Max num nodes shift(3-10)" range 3 10 default "10" - depends on NEED_MULTIPLE_NODES + depends on NUMA help This option specifies the maximum number of nodes in your SSI system. MAX_NUMNODES will be 2^(This value). diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ed51970c08e7..4704a16c2e44 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2867,7 +2867,7 @@ config RANDOMIZE_BASE_MAX_OFFSET config NODES_SHIFT int default "6" - depends on NEED_MULTIPLE_NODES + depends on NUMA config HW_PERF_EVENTS bool "Enable hardware performance counter support for perf events" diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 7649ab45e80c..602a21aee9d4 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA # include #endif diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 195ff4e9771f..96bc798c1ec1 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -239,7 +239,7 @@ static inline int pfn_valid(unsigned long pfn) /* pfn_valid is defined in linux/mmzone.h */ -#elif defined(CONFIG_NEED_MULTIPLE_NODES) +#elif defined(CONFIG_NUMA) #define pfn_valid(pfn) \ ({ \ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 97f6ca341448..19347dc6bbf8 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -394,7 +394,7 @@ void maar_init(void) } } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -473,7 +473,7 @@ void __init mem_init(void) 0x80000000 - 4, KCORE_TEXT); #endif } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) { diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 088dd2afcfe4..14b132cf95e2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -671,7 +671,7 @@ config NODES_SHIFT int default "8" if PPC64 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA config USE_PERCPU_NUMA_NODE_ID def_bool y diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 6cda76b57c5d..4c6c6dbd182f 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -18,7 +18,7 @@ * flags field of the struct page */ -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA extern struct pglist_data *node_data[]; /* @@ -41,7 +41,7 @@ u64 memory_hotplug_max(void); #else #define memory_hotplug_max() memblock_end_of_DRAM() -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #ifdef CONFIG_FA_DUMP #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e42b85e4f1aa..a35fbf4d0bce 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -788,7 +788,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2e05c783440a..a5209ea3859e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1047,7 +1047,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * numa_node_id() works after this. */ diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 56da5eb2b923..48525e8b5730 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -68,11 +68,11 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index c3df3a8501d4..2ffcf540f08b 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,7 +13,7 @@ obj-y := fault.o mem.o pgtable.o mmap.o maccess.o \ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ -obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index a6b36a40897a..c5e520c6f13b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -127,7 +127,7 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, } #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init mem_topology_setup(void) { max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; @@ -162,7 +162,7 @@ static int __init mark_nonram_nosave(void) return 0; } -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ static int __init mark_nonram_nosave(void) { return 0; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 18ec0f9bb8d5..15f9490a7aad 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -332,7 +332,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "2" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b4c7c34069f8..707afbcd81c2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -475,7 +475,7 @@ config NUMA config NODES_SHIFT int - depends on NEED_MULTIPLE_NODES + depends on NUMA default "1" config SCHED_SMT diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h index 6552a088dc97..7b8dead2723d 100644 --- a/arch/sh/include/asm/mmzone.h +++ b/arch/sh/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_MMZONE_H #define __ASM_SH_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include extern struct pglist_data *node_data[]; @@ -31,7 +31,7 @@ static inline void setup_bootmem_node(int nid, unsigned long start, unsigned long end) { } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Platform specific mem init */ void __init plat_mem_setup(void); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 7a989eed3b18..76af6db9daa2 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,7 +46,7 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); #endif diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index d551a9cac41e..ba569cfb4368 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -120,7 +120,7 @@ config NODES_SHIFT int default "3" if CPU_SUBTYPE_SHX3 default "1" - depends on NEED_MULTIPLE_NODES + depends on NUMA config ARCH_FLATMEM_ENABLE def_bool y diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 168d7d4dd735..ce26c7f8950a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -211,7 +211,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_try_nid( sizeof(struct pglist_data), SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 164a5254c91c..c72f52c704cd 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -265,7 +265,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 4 5 if SPARC64 default "5" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index 6543fb97a849..a236d8aa893a 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef _SPARC64_MMZONE_H #define _SPARC64_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include @@ -13,6 +13,6 @@ extern struct pglist_data *node_data[]; extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #endif /* _SPARC64_MMZONE_H */ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e38d8bf454e8..c89a5971fb0d 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1546,7 +1546,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = cpu_to_node(cpu); void *ptr; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index e454f179cf5d..06e938d03f3b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -903,7 +903,7 @@ struct node_mem_mask { static struct node_mem_mask node_masks[MAX_NUMNODES]; static int num_node_masks; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct mdesc_mlgroup { u64 node; @@ -1059,7 +1059,7 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); @@ -1080,7 +1080,7 @@ static void __init allocate_node_data(int nid) static void init_node_masks_nonnuma(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; #endif @@ -1090,7 +1090,7 @@ static void init_node_masks_nonnuma(void) node_masks[0].match = 0; num_node_masks = 1; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for (i = 0; i < NR_CPUS; i++) numa_cpu_lookup_table[i] = 0; @@ -1098,7 +1098,7 @@ static void init_node_masks_nonnuma(void) #endif } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); @@ -2487,7 +2487,7 @@ int page_in_phys_avail(unsigned long paddr) static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; for_each_online_node(i) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..5d523ff70fe7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1597,7 +1597,7 @@ config NODES_SHIFT default "10" if MAXSMP default "6" if X86_64 default "3" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0941d2f44f2a..78a32b956e81 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ static bool __init pcpu_need_numa(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA pg_data_t *last = NULL; unsigned int cpu; @@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; @@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 21ffb03f6c72..74b78840182d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -651,7 +651,7 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -677,7 +677,7 @@ void __init initmem_init(void) setup_bootmem_allocator(); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void __init setup_bootmem_allocator(void) { diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 5aa8705df87e..4dbe715be65b 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -45,7 +45,7 @@ #endif #ifndef cpumask_of_node - #ifdef CONFIG_NEED_MULTIPLE_NODES + #ifdef CONFIG_NUMA #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) #else #define cpumask_of_node(node) ((void)(node), cpu_online_mask) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5984fff3f175..552309342c38 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int nid; #endif }; @@ -347,7 +347,7 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -366,7 +366,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bd21e6fad6a..07922ee1477e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,7 +46,7 @@ extern int sysctl_page_lock_unfairness; void init_mm_internals(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ +#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) @@ -2460,7 +2460,7 @@ extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f9829562af2..4bd420ed3961 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1043,17 +1043,17 @@ extern int percpu_pagelist_high_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ #include -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 684a6061a13a..0a4780c047c9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 218b96ccc84a..bffe4bd859f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -59,15 +59,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -# -# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's -# to represent different areas of memory. This variable allows -# those dependencies to exist individually. -# -config NEED_MULTIPLE_NODES - def_bool y - depends on NUMA - # # SPARSEMEM_EXTREME (which is the default) does some bootmem # allocations when sparse_init() is called. If this cannot diff --git a/mm/memblock.c b/mm/memblock.c index afaefa8fc6ab..123feef5259d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -92,7 +92,7 @@ * system initialization completes. */ -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); #endif @@ -607,7 +607,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int start_rgn, end_rgn; int i, ret; @@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memory.c b/mm/memory.c index 3dd6b2e73e1d..48c4576df898 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -90,8 +90,7 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES -/* use the per-pgdat data instead for discontigmem - mbligh */ +#ifndef CONFIG_NUMA unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8926f3fd3bcf..c4069f9e3968 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1634,7 +1634,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * During memory init memblocks map pfns to nids. The search is expensive and @@ -1684,7 +1684,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -7438,7 +7438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA /* * With no DISCONTIG, the global mem_map is just set as node 0's */ diff --git a/mm/sparse.c b/mm/sparse.c index 55c18aff3e42..7272f7a1449d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -346,7 +346,7 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA return __pa_symbol(pgdat); #else return __pa(pgdat); -- cgit v1.2.3 From 43b02ba93b25b1caff7a3457fc5d005485e78da5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:05 -0700 Subject: mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM After removal of the DISCONTIGMEM memory model the FLAT_NODE_MEM_MAP configuration option is equivalent to FLATMEM. Drop CONFIG_FLAT_NODE_MEM_MAP and use CONFIG_FLATMEM instead. Link: https://lkml.kernel.org/r/20210608091316.3622-10-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- kernel/crash_core.c | 2 +- mm/Kconfig | 4 ---- mm/page_alloc.c | 6 +++--- mm/page_ext.c | 2 +- 5 files changed, 7 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4bd420ed3961..578588d4afc9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -788,7 +788,7 @@ typedef struct pglist_data { struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ -#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ +#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; @@ -878,7 +878,7 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) #else #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 0a4780c047c9..da449c1cdca7 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -484,7 +484,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM VMCOREINFO_OFFSET(pglist_data, node_mem_map); #endif VMCOREINFO_OFFSET(pglist_data, node_start_pfn); diff --git a/mm/Kconfig b/mm/Kconfig index bffe4bd859f3..ded98fb859ab 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -55,10 +55,6 @@ config FLATMEM def_bool y depends on !SPARSEMEM || FLATMEM_MANUAL -config FLAT_NODE_MEM_MAP - def_bool y - depends on !SPARSEMEM - # # SPARSEMEM_EXTREME (which is the default) does some bootmem # allocations when sparse_init() is called. If this cannot diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4069f9e3968..0e441f1677f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6547,7 +6547,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -7403,7 +7403,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } } -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -7451,7 +7451,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) } #else static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) diff --git a/mm/page_ext.c b/mm/page_ext.c index df6f74aac8e1..293b2685fc48 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -191,7 +191,7 @@ fail: panic("Out of memory"); } -#else /* CONFIG_FLAT_NODE_MEM_MAP */ +#else /* CONFIG_FLATMEM */ struct page_ext *lookup_page_ext(const struct page *page) { -- cgit v1.2.3 From 44042b4498728f4376e84bae1ac8016d146d850b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:08 -0700 Subject: mm/page_alloc: allow high-order pages to be stored on the per-cpu lists The per-cpu page allocator (PCP) only stores order-0 pages. This means that all THP and "cheap" high-order allocations including SLUB contends on the zone->lock. This patch extends the PCP allocator to store THP and "cheap" high-order pages. Note that struct per_cpu_pages increases in size to 256 bytes (4 cache lines) on x86-64. Note that this is not necessarily a universal performance win because of how it is implemented. High-order pages can cause pcp->high to be exceeded prematurely for lower-orders so for example, a large number of THP pages being freed could release order-0 pages from the PCP lists. Hence, much depends on the allocation/free pattern as observed by a single CPU to determine if caching helps or hurts a particular workload. That said, basic performance testing passed. The following is a netperf UDP_STREAM test which hits the relevant patches as some of the network allocations are high-order. netperf-udp 5.13.0-rc2 5.13.0-rc2 mm-pcpburst-v3r4 mm-pcphighorder-v1r7 Hmean send-64 261.46 ( 0.00%) 266.30 * 1.85%* Hmean send-128 516.35 ( 0.00%) 536.78 * 3.96%* Hmean send-256 1014.13 ( 0.00%) 1034.63 * 2.02%* Hmean send-1024 3907.65 ( 0.00%) 4046.11 * 3.54%* Hmean send-2048 7492.93 ( 0.00%) 7754.85 * 3.50%* Hmean send-3312 11410.04 ( 0.00%) 11772.32 * 3.18%* Hmean send-4096 13521.95 ( 0.00%) 13912.34 * 2.89%* Hmean send-8192 21660.50 ( 0.00%) 22730.72 * 4.94%* Hmean send-16384 31902.32 ( 0.00%) 32637.50 * 2.30%* Functionally, a patch like this is necessary to make bulk allocation of high-order pages work with similar performance to order-0 bulk allocations. The bulk allocator is not updated in this series as it would have to be determined by bulk allocation users how they want to track the order of pages allocated with the bulk allocator. Link: https://lkml.kernel.org/r/20210611135753.GC30378@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Dave Hansen Cc: Michal Hocko Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 20 +++++- mm/internal.h | 2 +- mm/page_alloc.c | 169 +++++++++++++++++++++++++++++++++++-------------- mm/swap.c | 2 +- 4 files changed, 144 insertions(+), 49 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578588d4afc9..265a32e1ff74 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -333,6 +333,24 @@ enum zone_watermarks { NR_WMARK }; +/* + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional + * for pageblock size for THP if configured. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define NR_PCP_THP 1 +#else +#define NR_PCP_THP 0 +#endif +#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) + +/* + * Shift to encode migratetype and order in the same integer, with order + * in the least significant bits. + */ +#define NR_PCP_ORDER_WIDTH 8 +#define NR_PCP_ORDER_MASK ((1<_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) @@ -349,7 +367,7 @@ struct per_cpu_pages { #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ - struct list_head lists[MIGRATE_PCPTYPES]; + struct list_head lists[NR_PCP_LISTS]; }; struct per_cpu_zonestat { diff --git a/mm/internal.h b/mm/internal.h index 18e5fb4d225f..6ec2cea9926b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -203,7 +203,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page); +extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); extern void zone_pcp_update(struct zone *zone, int cpu_online); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e441f1677f3..34f097ecfe08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,10 +674,53 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + base = PAGE_ALLOC_COSTLY_ORDER + 1; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + order = pageblock_order; + VM_BUG_ON(order != pageblock_order); + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + static inline void free_the_page(struct page *page, unsigned int order) { - if (order == 0) /* Via pcp? */ - free_unref_page(page); + if (pcp_allowed_order(order)) /* Via pcp? */ + free_unref_page(page, order); else __free_pages_ok(page, order, FPI_NONE); } @@ -700,7 +743,7 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page), FPI_NONE); + free_the_page(page, compound_order(page)); } void prep_compound_page(struct page *page, unsigned int order) @@ -1350,9 +1393,9 @@ static __always_inline bool free_pages_prepare(struct page *page, * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when * moved from pcp lists to free lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1369,12 +1412,12 @@ static bool bulkfree_pcp_prepare(struct page *page) * debug_pagealloc enabled, they are checked also immediately when being freed * to the pcp lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); else - return free_pages_prepare(page, 0, false, FPI_NONE); + return free_pages_prepare(page, order, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1406,8 +1449,10 @@ static inline void prefetch_buddy(struct page *page) static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { - int migratetype = 0; + int pindex = 0; int batch_free = 0; + int nr_freed = 0; + unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; @@ -1418,7 +1463,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * below while (list_empty(list)) loop. */ count = min(pcp->count, count); - while (count) { + while (count > 0) { struct list_head *list; /* @@ -1430,24 +1475,31 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ do { batch_free++; - if (++migratetype == MIGRATE_PCPTYPES) - migratetype = 0; - list = &pcp->lists[migratetype]; + if (++pindex == NR_PCP_LISTS) + pindex = 0; + list = &pcp->lists[pindex]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. */ - if (batch_free == MIGRATE_PCPTYPES) + if (batch_free == NR_PCP_LISTS) batch_free = count; + order = pindex_to_order(pindex); + BUILD_BUG_ON(MAX_ORDER >= (1<lru); - pcp->count--; + nr_freed += 1 << order; + count -= 1 << order; if (bulkfree_pcp_prepare(page)) continue; + /* Encode order with the migratetype */ + page->index <<= NR_PCP_ORDER_WIDTH; + page->index |= order; + list_add_tail(&page->lru, &head); /* @@ -1463,8 +1515,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page); prefetch_nr--; } - } while (--count && --batch_free && !list_empty(list)); + } while (count > 0 && --batch_free && !list_empty(list)); } + pcp->count -= nr_freed; /* * local_lock_irq held so equivalent to spin_lock_irqsave for @@ -1479,14 +1532,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ list_for_each_entry_safe(page, tmp, &head, lru) { int mt = get_pcppage_migratetype(page); + + /* mt has been encoded with the order (see above) */ + order = mt & NR_PCP_ORDER_MASK; + mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); } spin_unlock(&zone->lock); } @@ -3263,11 +3321,12 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_unref_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) { int migratetype; - if (!free_pcp_prepare(page)) + if (!free_pcp_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); @@ -3317,16 +3376,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) } static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype) + int migratetype, unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int high; + int pindex; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); - list_add(&page->lru, &pcp->lists[migratetype]); - pcp->count++; + pindex = order_to_pindex(migratetype, order); + list_add(&page->lru, &pcp->lists[pindex]); + pcp->count += 1 << order; high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); @@ -3336,15 +3397,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, } /* - * Free a 0-order page + * Free a pcp page */ -void free_unref_page(struct page *page) +void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, order)) return; /* @@ -3357,14 +3418,14 @@ void free_unref_page(struct page *page) migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype); + free_unref_page_commit(page, pfn, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3381,7 +3442,7 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, 0)) list_del(&page->lru); /* @@ -3413,7 +3474,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, 0); migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype); + free_unref_page_commit(page, pfn, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get @@ -3549,7 +3610,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, /* Remove page from the per-cpu list, caller must protect the list */ static inline -struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3558,16 +3620,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - READ_ONCE(pcp->batch), list, + int batch = READ_ONCE(pcp->batch); + int alloced; + + /* + * Scale batch relative to order if batch implies + * free pages can be stored on the PCP. Batch can + * be 1 for small zones or for boot pagesets which + * should never store free pages as the pages may + * belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + alloced = rmqueue_bulk(zone, order, + batch, list, migratetype, alloc_flags); + + pcp->count += alloced << order; if (unlikely(list_empty(list))) return NULL; } page = list_first_entry(list, struct page, lru); list_del(&page->lru); - pcp->count--; + pcp->count -= 1 << order; } while (check_new_pcp(page)); return page; @@ -3575,8 +3651,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, gfp_t gfp_flags, - int migratetype, unsigned int alloc_flags) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3592,8 +3669,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, */ pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; - list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3614,15 +3691,15 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0)) { + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and * we need to skip it when CMA area isn't allowed. */ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, - migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype, alloc_flags); goto out; } } @@ -5201,7 +5278,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[ac.migratetype]; + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { @@ -5211,7 +5288,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, continue; } - page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { /* Try and get at least one page */ @@ -6778,13 +6855,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - int migratetype; + int pindex; memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); - for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) - INIT_LIST_HEAD(&pcp->lists[migratetype]); + for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) + INIT_LIST_HEAD(&pcp->lists[pindex]); /* * Set batch and high values safe for a boot pageset. A true percpu diff --git a/mm/swap.c b/mm/swap.c index 18cc9e63515b..6c11db780467 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -95,7 +95,7 @@ static void __put_single_page(struct page *page) { __page_cache_release(page); mem_cgroup_uncharge(page); - free_unref_page(page); + free_unref_page(page, 0); } static void __put_compound_page(struct page *page) -- cgit v1.2.3 From 203c06eef579c670b8eb3a24108b9837bf9b7737 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:11 -0700 Subject: mm/page_alloc: split pcp->high across all online CPUs for cpuless nodes Dave Hansen reported the following about Feng Tang's tests on a machine with persistent memory onlined as a DRAM-like device. Feng Tang tossed these on a "Cascade Lake" system with 96 threads and ~512G of persistent memory and 128G of DRAM. The PMEM is in "volatile use" mode and being managed via the buddy just like the normal RAM. The PMEM zones are big ones: present 65011712 = 248 G high 134595 = 525 M The PMEM nodes, of course, don't have any CPUs in them. With your series, the pcp->high value per-cpu is 69584 pages or about 270MB per CPU. Scaled up by the 96 CPU threads, that's ~26GB of worst-case memory in the pcps per zone, or roughly 10% of the size of the zone. This should not cause a problem as such although it could trigger reclaim due to pages being stored on per-cpu lists for CPUs remote to a node. It is not possible to treat cpuless nodes exactly the same as normal nodes but the worst-case scenario can be mitigated by splitting pcp->high across all online CPUs for cpuless memory nodes. Link: https://lkml.kernel.org/r/20210616110743.GK30378@techsingularity.net Suggested-by: Dave Hansen Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Cc: "Tang, Feng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34f097ecfe08..db00ee8d79d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6790,7 +6790,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; - int nr_local_cpus; + int nr_split_cpus; unsigned long total_pages; if (!percpu_pagelist_high_fraction) { @@ -6813,10 +6813,14 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * Split the high value across all online CPUs local to the zone. Note * that early in boot that CPUs may not be online yet and that during * CPU hotplug that the cpumask is not yet updated when a CPU is being - * onlined. - */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = total_pages / nr_local_cpus; + * onlined. For memory nodes that have no CPUs, split pcp->high across + * all online CPUs to mitigate the risk that reclaim is triggered + * prematurely due to pages stored on pcp lists. + */ + nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; + if (!nr_split_cpus) + nr_split_cpus = num_online_cpus(); + high = total_pages / nr_split_cpus; /* * Ensure high is at least batch*4. The multiple is based on the -- cgit v1.2.3