summaryrefslogtreecommitdiff
path: root/mm/mm_init.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 10:28:11 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 10:28:11 -0700
commit6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch)
tree2c425707f78642625dbe2c824c7fded2021e3dc7 /mm/mm_init.c
parent6aeadf7896bff4ca230702daba8788455e6b866e (diff)
parentacc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff)
downloadlwn-6e17c6de3ddf3073741d9c91a796ee696914d8a0.tar.gz
lwn-6e17c6de3ddf3073741d9c91a796ee696914d8a0.zip
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton: - Yosry Ahmed brought back some cgroup v1 stats in OOM logs - Yosry has also eliminated cgroup's atomic rstat flushing - Nhat Pham adds the new cachestat() syscall. It provides userspace with the ability to query pagecache status - a similar concept to mincore() but more powerful and with improved usability - Mel Gorman provides more optimizations for compaction, reducing the prevalence of page rescanning - Lorenzo Stoakes has done some maintanance work on the get_user_pages() interface - Liam Howlett continues with cleanups and maintenance work to the maple tree code. Peng Zhang also does some work on maple tree - Johannes Weiner has done some cleanup work on the compaction code - David Hildenbrand has contributed additional selftests for get_user_pages() - Thomas Gleixner has contributed some maintenance and optimization work for the vmalloc code - Baolin Wang has provided some compaction cleanups, - SeongJae Park continues maintenance work on the DAMON code - Huang Ying has done some maintenance on the swap code's usage of device refcounting - Christoph Hellwig has some cleanups for the filemap/directio code - Ryan Roberts provides two patch series which yield some rationalization of the kernel's access to pte entries - use the provided APIs rather than open-coding accesses - Lorenzo Stoakes has some fixes to the interaction between pagecache and directio access to file mappings - John Hubbard has a series of fixes to the MM selftesting code - ZhangPeng continues the folio conversion campaign - Hugh Dickins has been working on the pagetable handling code, mainly with a view to reducing the load on the mmap_lock - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from 128 to 8 - Domenico Cerasuolo has improved the zswap reclaim mechanism by reorganizing the LRU management - Matthew Wilcox provides some fixups to make gfs2 work better with the buffer_head code - Vishal Moola also has done some folio conversion work - Matthew Wilcox has removed the remnants of the pagevec code - their functionality is migrated over to struct folio_batch * tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits) mm/hugetlb: remove hugetlb_set_page_subpool() mm: nommu: correct the range of mmap_sem_read_lock in task_mem() hugetlb: revert use of page_cache_next_miss() Revert "page cache: fix page_cache_next/prev_miss off by one" mm/vmscan: fix root proactive reclaim unthrottling unbalanced node mm: memcg: rename and document global_reclaim() mm: kill [add|del]_page_to_lru_list() mm: compaction: convert to use a folio in isolate_migratepages_block() mm: zswap: fix double invalidate with exclusive loads mm: remove unnecessary pagevec includes mm: remove references to pagevec mm: rename invalidate_mapping_pagevec to mapping_try_invalidate mm: remove struct pagevec net: convert sunrpc from pagevec to folio_batch i915: convert i915_gpu_error to use a folio_batch pagevec: rename fbatch_count() mm: remove check_move_unevictable_pages() drm: convert drm_gem_put_pages() to use a folio_batch i915: convert shmem_sg_free_table() to use a folio_batch scatterlist: add sg_set_folio() ...
Diffstat (limited to 'mm/mm_init.c')
-rw-r--r--mm/mm_init.c154
1 files changed, 102 insertions, 52 deletions
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1cfc08e25f93..a1963c3322af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core,
return 0;
}
+bool mirrored_kernelcore __initdata_memblock;
+
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
@@ -644,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
}
/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
+static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
- int nid = early_pfn_to_nid(pfn);
-
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return false;
@@ -693,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
pg_data_t *pgdat;
- int nid, zid;
+ int zid;
- if (early_page_initialised(pfn))
+ if (early_page_initialised(pfn, nid))
return;
- nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
@@ -715,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-static inline bool early_page_initialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn, int nid)
{
return true;
}
@@ -725,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn)
+static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -736,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+void __meminit reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -745,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
- init_reserved_page(start_pfn);
+ init_reserved_page(start_pfn, nid);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
@@ -1166,24 +1166,15 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
+ unsigned long zone_start_pfn,
+ unsigned long zone_end_pfn)
{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long zone_start_pfn, zone_end_pfn;
unsigned long nr_absent;
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
+ /* zone is empty, we don't have any absent pages */
+ if (zone_start_pfn == zone_end_pfn)
return 0;
- zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
@@ -1227,9 +1218,6 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
@@ -1250,6 +1238,24 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
return *zone_end_pfn - *zone_start_pfn;
}
+static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
+ z->zone_start_pfn = 0;
+ z->spanned_pages = 0;
+ z->present_pages = 0;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ z->present_early_pages = 0;
+#endif
+ }
+
+ pgdat->node_spanned_pages = 0;
+ pgdat->node_present_pages = 0;
+ pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
+}
+
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
@@ -1261,7 +1267,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long spanned, absent;
- unsigned long size, real_size;
+ unsigned long real_size;
spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
@@ -1269,23 +1275,22 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
&zone_start_pfn,
&zone_end_pfn);
absent = zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn);
+ zone_start_pfn,
+ zone_end_pfn);
- size = spanned;
- real_size = size - absent;
+ real_size = spanned - absent;
- if (size)
+ if (spanned)
zone->zone_start_pfn = zone_start_pfn;
else
zone->zone_start_pfn = 0;
- zone->spanned_pages = size;
+ zone->spanned_pages = spanned;
zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
zone->present_early_pages = real_size;
#endif
- totalpages += size;
+ totalpages += spanned;
realtotalpages += real_size;
}
@@ -1506,6 +1511,8 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
pgdat->kswapd_order = 0;
pgdat->kswapd_highest_zoneidx = 0;
pgdat->node_start_pfn = 0;
+ pgdat->node_present_pages = 0;
+
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -1513,8 +1520,17 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
memset(p, 0, sizeof(*p));
}
- for (z = 0; z < MAX_NR_ZONES; z++)
- zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages and managed_pages because they will
+ * be updated in online_pages() and offline_pages().
+ */
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ zone->present_pages = 0;
+ zone_init_internals(zone, z, nid, 0);
+ }
}
#endif
@@ -1582,7 +1598,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
if (!size)
continue;
- set_pageblock_order();
setup_usemap(zone);
init_currently_empty_zone(zone, zone->zone_start_pfn, size);
}
@@ -1706,11 +1721,13 @@ static void __init free_area_init_node(int nid)
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
} else {
pr_info("Initmem setup node %d as memoryless\n", nid);
- }
- calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+ reset_memoryless_node_totalpages(pgdat);
+ }
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -1720,7 +1737,7 @@ static void __init free_area_init_node(int nid)
}
/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
+static void check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -1728,9 +1745,9 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
if (IS_ENABLED(CONFIG_HIGHMEM))
- node_set_state(nid, N_HIGH_MEMORY);
+ node_set_state(pgdat->node_id, N_HIGH_MEMORY);
if (zone_type <= ZONE_NORMAL)
- node_set_state(nid, N_NORMAL_MEMORY);
+ node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
break;
}
}
@@ -1749,11 +1766,6 @@ void __init setup_nr_node_ids(void)
}
#endif
-static void __init free_area_init_memoryless_node(int nid)
-{
- free_area_init_node(nid);
-}
-
/*
* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
@@ -1852,6 +1864,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
+ set_pageblock_order();
+
for_each_node(nid) {
pg_data_t *pgdat;
@@ -1864,7 +1878,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
panic("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
- free_area_init_memoryless_node(nid);
+ free_area_init_node(nid);
/*
* We do not want to confuse userspace by sysfs
@@ -1885,7 +1899,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat, nid);
+ check_for_memory(pgdat);
}
memmap_init();
@@ -2335,6 +2349,28 @@ void __init init_cma_reserved_pageblock(struct page *page)
}
#endif
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
void __init page_alloc_init_late(void)
{
struct zone *zone;
@@ -2375,6 +2411,8 @@ void __init page_alloc_init_late(void)
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
page_ext_init();
+
+ page_alloc_sysctl_init();
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
@@ -2539,8 +2577,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (!early_page_initialised(pfn))
- return;
+
+ if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
+ int nid = early_pfn_to_nid(pfn);
+
+ if (!early_page_initialised(pfn, nid))
+ return;
+ }
+
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
return;
@@ -2548,6 +2592,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
__free_pages_core(page, order);
}
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
+EXPORT_SYMBOL(init_on_alloc);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
+EXPORT_SYMBOL(init_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)