summaryrefslogtreecommitdiff
path: root/include/linux/mmzone.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mmzone.h')
-rw-r--r--include/linux/mmzone.h433
1 files changed, 356 insertions, 77 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..9adb2ad21da5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -23,6 +23,7 @@
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
+#include <linux/sizes.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -37,6 +38,22 @@
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
+/* Defines the order for the number of pages that have a migrate type. */
+#ifndef CONFIG_PAGE_BLOCK_MAX_ORDER
+#define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER
+#else
+#define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER
+#endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */
+
+/*
+ * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated
+ * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_MAX_ORDER,
+ * which defines the order for the number of pages that can have a migrate type
+ */
+#if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER)
+#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER
+#endif
+
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
* costly to service. That is between allocation orders which should
@@ -45,6 +62,59 @@
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
+#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
+/*
+ * We don't expect any folios that exceed buddy sizes (and consequently
+ * memory sections).
+ */
+#define MAX_FOLIO_ORDER MAX_PAGE_ORDER
+#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/*
+ * Only pages within a single memory section are guaranteed to be
+ * contiguous. By limiting folios to a single memory section, all folio
+ * pages are guaranteed to be contiguous.
+ */
+#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
+#elif defined(CONFIG_HUGETLB_PAGE)
+/*
+ * There is no real limit on the folio size. We limit them to the maximum we
+ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
+ * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
+ */
+#ifdef CONFIG_64BIT
+#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT)
+#else
+#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT)
+#endif
+#else
+/*
+ * Without hugetlb, gigantic folios that are bigger than a single PUD are
+ * currently impossible.
+ */
+#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT)
+#endif
+
+#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER)
+
+/*
+ * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to
+ * be naturally aligned with regard to the folio size.
+ *
+ * HVO which is only active if the size of struct page is a power of 2.
+ */
+#define MAX_FOLIO_VMEMMAP_ALIGN \
+ (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \
+ is_power_of_2(sizeof(struct page)) ? \
+ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0)
+
+/*
+ * vmemmap optimization (like HVO) is only possible for page orders that fill
+ * two or more pages with struct pages.
+ */
+#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page)))
+#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1)
+#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0)
+
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
@@ -63,6 +133,9 @@ enum migratetype {
* __free_pageblock_cma() function.
*/
MIGRATE_CMA,
+ __MIGRATE_TYPE_END = MIGRATE_CMA,
+#else
+ __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
@@ -76,8 +149,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES];
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
-# define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \
- get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK))
+/*
+ * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio,
+ * so folio_pfn() cannot be used and pfn is needed.
+ */
+# define is_migrate_cma_folio(folio, pfn) \
+ (get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA)
#else
# define is_migrate_cma(migratetype) false
# define is_migrate_cma_page(_page) false
@@ -106,14 +183,12 @@ static inline bool migratetype_is_mergeable(int mt)
extern int page_group_by_mobility_disabled;
-#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
+#define get_pageblock_migratetype(page) \
+ get_pfnblock_migratetype(page, page_to_pfn(page))
-#define get_pageblock_migratetype(page) \
- get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
+#define folio_migratetype(folio) \
+ get_pageblock_migratetype(&folio->page)
-#define folio_migratetype(folio) \
- get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \
- MIGRATETYPE_MASK)
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
@@ -138,6 +213,7 @@ enum numa_stat_item {
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
+ NR_FREE_PAGES_BLOCKS,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
@@ -147,7 +223,6 @@ enum zone_stat_item {
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
/* Second 128 byte cacheline */
- NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
@@ -185,7 +260,6 @@ enum node_stat_item {
NR_FILE_PAGES,
NR_FILE_DIRTY,
NR_WRITEBACK,
- NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
@@ -200,6 +274,7 @@ enum node_stat_item {
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
+ NR_VMALLOC,
NR_KERNEL_STACK_KB, /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
NR_KERNEL_SCS_KB, /* measured in KiB */
@@ -214,15 +289,47 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
- PGPROMOTE_CANDIDATE, /* candidate pages to promote */
+ /**
+ * Candidate pages for promotion based on hint fault latency. This
+ * counter is used to control the promotion rate and adjust the hot
+ * threshold.
+ */
+ PGPROMOTE_CANDIDATE,
+ /**
+ * Not rate-limited (NRL) candidate pages for those can be promoted
+ * without considering hot threshold because of enough free pages in
+ * fast-tier node. These promotions bypass the regular hotness checks
+ * and do NOT influence the promotion rate-limiter or
+ * threshold-adjustment logic.
+ * This is for statistics/monitoring purposes.
+ */
+ PGPROMOTE_CANDIDATE_NRL,
#endif
/* PGDEMOTE_*: pages demoted */
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
+ PGSTEAL_ANON,
+ PGSTEAL_FILE,
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
+ PGSCAN_ANON,
+ PGSCAN_FILE,
+ PGREFILL,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_BALLOON_PAGES,
+ NR_KERNEL_FILE_PAGES,
+ NR_GPU_ACTIVE, /* Pages assigned to GPU objects */
+ NR_GPU_RECLAIM, /* Pages in shrinkable GPU pools */
NR_VM_NODE_STAT_ITEMS
};
@@ -579,7 +686,7 @@ struct lru_gen_memcg {
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
-bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr);
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -587,6 +694,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid);
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid);
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
#else /* !CONFIG_LRU_GEN */
@@ -598,7 +708,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
-static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw,
+ unsigned int nr)
{
return false;
}
@@ -627,6 +738,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
+static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+}
+
+static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+ return true;
+}
+
+static inline
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
struct lruvec {
@@ -964,6 +1089,9 @@ struct zone {
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
struct list_head unaccepted_pages;
+
+ /* To be called once the last page in the zone is accepted */
+ struct work_struct unaccepted_cleanup;
#endif
/* zone flags, see below */
@@ -972,6 +1100,9 @@ struct zone {
/* Primarily protects free_area */
spinlock_t lock;
+ /* Pages to be freed when next trylock succeeds */
+ struct llist_head trylock_free_pages;
+
/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);
@@ -1014,13 +1145,12 @@ struct zone {
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ struct page *vmemmap_tails[NR_VMEMMAP_TAILS];
+#endif
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
- PGDAT_DIRTY, /* reclaim scanning has recently found
- * many dirty file pages at the tail
- * of the LRU.
- */
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
@@ -1061,7 +1191,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z)
return wmark_pages(z, WMARK_PROMO);
}
-static inline unsigned long zone_managed_pages(struct zone *zone)
+static inline unsigned long zone_managed_pages(const struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
}
@@ -1085,12 +1215,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}
-static inline bool zone_is_initialized(struct zone *zone)
+static inline bool zone_is_initialized(const struct zone *zone)
{
return zone->initialized;
}
-static inline bool zone_is_empty(struct zone *zone)
+static inline bool zone_is_empty(const struct zone *zone)
{
return zone->spanned_pages == 0;
}
@@ -1141,21 +1271,32 @@ static inline bool zone_is_empty(struct zone *zone)
#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
+static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
+{
+ ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
+ return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
static inline enum zone_type page_zonenum(const struct page *page)
{
- ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
- return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+ return memdesc_zonenum(page->flags);
}
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
- return page_zonenum(&folio->page);
+ return memdesc_zonenum(folio->flags);
}
#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
+{
+ return memdesc_zonenum(mdf) == ZONE_DEVICE;
+}
+
+static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
- return page_zonenum(page) == ZONE_DEVICE;
+ VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page);
+ return page_folio(page)->pgmap;
}
/*
@@ -1169,17 +1310,17 @@ static inline bool is_zone_device_page(const struct page *page)
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
const struct page *b)
{
- if (is_zone_device_page(a) != is_zone_device_page(b))
+ if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags))
return false;
- if (!is_zone_device_page(a))
+ if (!memdesc_is_zone_device(a->flags))
return true;
- return a->pgmap == b->pgmap;
+ return page_pgmap(a) == page_pgmap(b);
}
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
-static inline bool is_zone_device_page(const struct page *page)
+static inline bool memdesc_is_zone_device(memdesc_flags_t mdf)
{
return false;
}
@@ -1188,11 +1329,20 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
{
return true;
}
+static inline struct dev_pagemap *page_pgmap(const struct page *page)
+{
+ return NULL;
+}
#endif
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return memdesc_is_zone_device(page->flags);
+}
+
static inline bool folio_is_zone_device(const struct folio *folio)
{
- return is_zone_device_page(&folio->page);
+ return memdesc_is_zone_device(folio->flags);
}
static inline bool is_zone_movable_page(const struct page *page)
@@ -1210,7 +1360,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio)
* Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
* intersection with the given zone
*/
-static inline bool zone_intersects(struct zone *zone,
+static inline bool zone_intersects(const struct zone *zone,
unsigned long start_pfn, unsigned long nr_pages)
{
if (zone_is_empty(zone))
@@ -1377,7 +1527,7 @@ typedef struct pglist_data {
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
- int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+ atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
@@ -1475,16 +1625,27 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
- enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx);
+
+enum kswapd_clear_hopeless_reason {
+ KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+ KSWAPD_CLEAR_HOPELESS_KSWAPD,
+ KSWAPD_CLEAR_HOPELESS_DIRECT,
+ KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+ enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+ unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
@@ -1520,12 +1681,12 @@ static inline int local_memory_node(int node_id) { return node_id; };
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
#ifdef CONFIG_ZONE_DEVICE
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return zone_idx(zone) == ZONE_DEVICE;
}
#else
-static inline bool zone_is_zone_device(struct zone *zone)
+static inline bool zone_is_zone_device(const struct zone *zone)
{
return false;
}
@@ -1537,19 +1698,19 @@ static inline bool zone_is_zone_device(struct zone *zone)
* populated_zone(). If the whole zone is reserved then we can easily
* end up with populated_zone() && !managed_zone().
*/
-static inline bool managed_zone(struct zone *zone)
+static inline bool managed_zone(const struct zone *zone)
{
return zone_managed_pages(zone);
}
/* Returns true if a zone has memory */
-static inline bool populated_zone(struct zone *zone)
+static inline bool populated_zone(const struct zone *zone)
{
return zone->present_pages;
}
#ifdef CONFIG_NUMA
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return zone->node;
}
@@ -1559,7 +1720,7 @@ static inline void zone_set_nid(struct zone *zone, int nid)
zone->node = nid;
}
#else
-static inline int zone_to_nid(struct zone *zone)
+static inline int zone_to_nid(const struct zone *zone)
{
return 0;
}
@@ -1586,19 +1747,20 @@ static inline int is_highmem_idx(enum zone_type idx)
* @zone: pointer to struct zone variable
* Return: 1 for a highmem zone, 0 otherwise
*/
-static inline int is_highmem(struct zone *zone)
+static inline int is_highmem(const struct zone *zone)
{
return is_highmem_idx(zone_idx(zone));
}
-#ifdef CONFIG_ZONE_DMA
-bool has_managed_dma(void);
-#else
+bool has_managed_zone(enum zone_type zone);
static inline bool has_managed_dma(void)
{
+#ifdef CONFIG_ZONE_DMA
+ return has_managed_zone(ZONE_DMA);
+#else
return false;
-}
#endif
+}
#ifndef CONFIG_NUMA
@@ -1652,12 +1814,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref)
return zoneref->zone;
}
-static inline int zonelist_zone_idx(struct zoneref *zoneref)
+static inline int zonelist_zone_idx(const struct zoneref *zoneref)
{
return zoneref->zone_idx;
}
-static inline int zonelist_node_idx(struct zoneref *zoneref)
+static inline int zonelist_node_idx(const struct zoneref *zoneref)
{
return zone_to_nid(zoneref->zone);
}
@@ -1839,15 +2001,13 @@ struct mem_section_usage {
unsigned long pageblock_flags[0];
};
-void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
-
struct page;
struct page_ext;
struct mem_section {
/*
* This is, logically, a pointer to an array of struct
* pages. However, it is stored with some other magic.
- * (see sparse.c::sparse_init_one_section())
+ * (see sparse_init_one_section())
*
* Additionally during early boot we encode node id of
* the location of the section here to guide allocation.
@@ -1910,21 +2070,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
extern size_t mem_section_usage_size(void);
/*
- * We use the lower bits of the mem_map pointer to store
- * a little bit of information. The pointer is calculated
- * as mem_map - section_nr_to_pfn(pnum). The result is
- * aligned to the minimum alignment of the two values:
- * 1. All mem_map arrays are page-aligned.
- * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
- * lowest bits. PFN_SECTION_SHIFT is arch-specific
- * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
- * worst combination is powerpc with 256k pages,
- * which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available on all architectures.
- * However, we can exceed 6 bits on some other architectures except
- * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
- * with the worst case of 64K pages on arm64) if we make sure the
- * exceeded bit is not applicable to powerpc.
+ * We use the lower bits of the mem_map pointer to store a little bit of
+ * information. The pointer is calculated as mem_map - section_nr_to_pfn().
+ * The result is aligned to the minimum alignment of the two values:
+ *
+ * 1. All mem_map arrays are page-aligned.
+ * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits.
+ *
+ * We always expect a single section to cover full pages. Therefore,
+ * we can safely assume that PFN_SECTION_SHIFT is large enough to
+ * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this.
*/
enum {
SECTION_MARKED_PRESENT_BIT,
@@ -1934,6 +2089,9 @@ enum {
#ifdef CONFIG_ZONE_DEVICE
SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+ SECTION_IS_VMEMMAP_PREINIT_BIT,
+#endif
SECTION_MAP_LAST_BIT,
};
@@ -1944,6 +2102,9 @@ enum {
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT)
+#endif
#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT
@@ -1954,7 +2115,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
return (struct page *)map;
}
-static inline int present_section(struct mem_section *section)
+static inline int present_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}
@@ -1964,12 +2125,12 @@ static inline int present_section_nr(unsigned long nr)
return present_section(__nr_to_section(nr));
}
-static inline int valid_section(struct mem_section *section)
+static inline int valid_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}
-static inline int early_section(struct mem_section *section)
+static inline int early_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_EARLY));
}
@@ -1979,25 +2140,49 @@ static inline int valid_section_nr(unsigned long nr)
return valid_section(__nr_to_section(nr));
}
-static inline int online_section(struct mem_section *section)
+static inline int online_section(const struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
#ifdef CONFIG_ZONE_DEVICE
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
#else
-static inline int online_device_section(struct mem_section *section)
+static inline int online_device_section(const struct mem_section *section)
{
return 0;
}
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+static inline int preinited_vmemmap_section(const struct mem_section *section)
+{
+ return (section &&
+ (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT));
+}
+
+void sparse_vmemmap_init_nid_early(int nid);
+void sparse_vmemmap_init_nid_late(int nid);
+
+#else
+static inline int preinited_vmemmap_section(const struct mem_section *section)
+{
+ return 0;
+}
+static inline void sparse_vmemmap_init_nid_early(int nid)
+{
+}
+
+static inline void sparse_vmemmap_init_nid_late(int nid)
+{
+}
+#endif
+
static inline int online_section_nr(unsigned long nr)
{
return online_section(__nr_to_section(nr));
@@ -2028,13 +2213,42 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
return usage ? test_bit(idx, usage->subsection_map) : 0;
}
+
+static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
+{
+ struct mem_section_usage *usage = READ_ONCE(ms->usage);
+ int idx = subsection_map_index(*pfn);
+ unsigned long bit;
+
+ if (!usage)
+ return false;
+
+ if (test_bit(idx, usage->subsection_map))
+ return true;
+
+ /* Find the next subsection that exists */
+ bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx);
+ if (bit == SUBSECTIONS_PER_SECTION)
+ return false;
+
+ *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION);
+ return true;
+}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
return 1;
}
+
+static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
+{
+ return true;
+}
#endif
+void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
+ unsigned long flags);
+
#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
* pfn_valid - check if there is a valid memory map entry for a PFN
@@ -2078,6 +2292,58 @@ static inline int pfn_valid(unsigned long pfn)
return ret;
}
+
+/* Returns end_pfn or higher if no valid PFN remaining in range */
+static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn)
+{
+ unsigned long nr = pfn_to_section_nr(pfn);
+
+ rcu_read_lock_sched();
+
+ while (nr <= __highest_present_section_nr && pfn < end_pfn) {
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ if (valid_section(ms) &&
+ (early_section(ms) || pfn_section_first_valid(ms, &pfn))) {
+ rcu_read_unlock_sched();
+ return pfn;
+ }
+
+ /* Nothing left in this section? Skip to next section */
+ nr++;
+ pfn = section_nr_to_pfn(nr);
+ }
+
+ rcu_read_unlock_sched();
+ return end_pfn;
+}
+
+static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn)
+{
+ pfn++;
+
+ if (pfn >= end_pfn)
+ return end_pfn;
+
+ /*
+ * Either every PFN within the section (or subsection for VMEMMAP) is
+ * valid, or none of them are. So there's no point repeating the check
+ * for every PFN; only call first_valid_pfn() again when crossing a
+ * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)).
+ */
+ if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ?
+ PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK))
+ return pfn;
+
+ return first_valid_pfn(pfn, end_pfn);
+}
+
+
+#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \
+ for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \
+ (_pfn) < (_end_pfn); \
+ (_pfn) = next_valid_pfn((_pfn), (_end_pfn)))
+
#endif
static inline int pfn_in_present_section(unsigned long pfn)
@@ -2097,6 +2363,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
return -1;
}
+#define for_each_present_section_nr(start, section_nr) \
+ for (section_nr = next_present_section_nr(start - 1); \
+ section_nr != -1; \
+ section_nr = next_present_section_nr(section_nr))
+
/*
* These are _only_ used during initialisation, therefore they
* can use __initdata ... They could have names to indicate
@@ -2112,14 +2383,22 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
#define pfn_to_nid(pfn) (0)
#endif
-void sparse_init(void);
#else
-#define sparse_init() do {} while (0)
-#define sparse_index_init(_sec, _nid) do {} while (0)
+#define sparse_vmemmap_init_nid_early(_nid) do {} while (0)
+#define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
#define pfn_in_present_section pfn_valid
-#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
+/*
+ * Fallback case for when the architecture provides its own pfn_valid() but
+ * not a corresponding for_each_valid_pfn().
+ */
+#ifndef for_each_valid_pfn
+#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \
+ for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \
+ if (pfn_valid(_pfn))
+#endif
+
#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */