Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 1122
1 file changed, 744 insertions(+), 378 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 01eab25edf89..f51aa6051a99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -273,6 +276,7 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
+int defrag_mode;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -508,9 +512,9 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- bool __maybe_unused movable;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
@@ -614,6 +618,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
+ if (migratetype != capc->cc->migratetype)
+ trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+ capc->cc->migratetype, migratetype);
+
capc->page = page;
return true;
}
@@ -655,16 +663,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
else
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
@@ -676,24 +688,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), old_mt, 1 << order);
+ get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(zone, -(1 << order), old_mt);
- account_freepages(zone, 1 << order, new_mt);
+ account_freepages(zone, -nr_pages, old_mt);
+ account_freepages(zone, nr_pages, new_mt);
+
+ if (order >= pageblock_order &&
+ is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+ if (!is_migrate_isolate(old_mt))
+ nr_pages = -nr_pages;
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+ }
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ int nr_pages = 1 << order;
+
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -703,6 +725,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -947,21 +972,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(folio_entire_mapcount(folio))) {
- bad_page(page, "nonzero entire_mapcount");
- goto out;
- }
if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount");
goto out;
}
- if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+ unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(atomic_read(&folio->_pincount))) {
- bad_page(page, "nonzero pincount");
- goto out;
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
}
break;
case 2:
@@ -970,7 +1008,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "on deferred list");
goto out;
}
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
+ }
break;
+ case 3:
+ /* the third tail page: hugetlb specifics overlap ->mappings */
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+ break;
+ fallthrough;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -1041,6 +1094,84 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+{
+ struct alloc_tag *tag;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = __pgalloc_tag_get(page);
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
@@ -1096,8 +1227,12 @@ __always_inline bool free_pages_prepare(struct page *page,
if (unlikely(order)) {
int i;
- if (compound)
+ if (compound) {
page[1].flags &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1249,13 +1384,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
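
The FPI_TRYLOCK path above never sleeps or spins on zone->lock: when the trylock fails, the page is parked on the per-zone llist and drained later by the next caller that does acquire the lock. Below is a minimal userspace sketch of the same trylock-or-defer pattern, with a pthread mutex and a C11 atomic stack standing in for zone->lock and zone->trylock_free_pages (illustrative analogy only, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct page with its pcp_llist/order fields. */
struct deferred {
	struct deferred *next;
	int order;
};

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;	/* zone->lock */
static _Atomic(struct deferred *) deferred_head;	/* zone->trylock_free_pages */

/* llist_add() analogue: lock-free push onto the deferred list. */
static void push_deferred(struct deferred *d)
{
	struct deferred *old = atomic_load(&deferred_head);

	do {
		d->next = old;
	} while (!atomic_compare_exchange_weak(&deferred_head, &old, d));
}

/* split_large_buddy() stand-in: the actual freeing work, done under the lock. */
static void do_free(struct deferred *d)
{
	printf("freeing order-%d chunk\n", d->order);
	free(d);
}

/* FPI_TRYLOCK-style caller: never blocks, defers when the lock is busy. */
static void free_trylock(struct deferred *d)
{
	if (pthread_mutex_trylock(&zone_lock)) {
		push_deferred(d);
		return;
	}
	do_free(d);
	pthread_mutex_unlock(&zone_lock);
}

/* Regular caller: takes the lock and drains whatever was deferred earlier. */
static void free_locked(struct deferred *d)
{
	pthread_mutex_lock(&zone_lock);
	for (struct deferred *p = atomic_exchange(&deferred_head, NULL); p; ) {
		struct deferred *next = p->next;

		do_free(p);
		p = next;
	}
	do_free(d);
	pthread_mutex_unlock(&zone_lock);
}

int main(void)
{
	struct deferred *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

	if (!a || !b)
		return 1;
	a->order = 0;
	b->order = 3;
	free_trylock(a);	/* defers only if zone_lock were contended */
	free_locked(b);		/* drains deferred entries, then frees b */
	return 0;
}
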
@@ -1295,12 +1461,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order,
set_page_count(p, 0);
}
- /*
- * Freeing the page with debug_pagealloc enabled will try to
- * unmap it; some archs don't like double-unmappings, so
- * map it first.
- */
- debug_pagealloc_map_pages(page, nr_pages);
adjust_managed_page_count(page, nr_pages);
} else {
for (loop = 0; loop < nr_pages; loop++, p++) {
@@ -1508,7 +1668,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
int i;
set_page_private(page, 0);
- set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
@@ -1832,39 +1991,6 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
- */
-static bool can_steal_fallback(unsigned int order, int start_mt)
-{
- /*
- * Leaving this order check is intended, although there is
- * relaxed order check in next check. The reason is that
- * we can actually steal whole pageblock if this condition met,
- * but, below check doesn't guarantee it and that is just heuristic
- * so could be changed anytime.
- */
- if (order >= pageblock_order)
- return true;
-
- if (order >= pageblock_order / 2 ||
- start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled)
- return true;
-
- return false;
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -1903,30 +2029,99 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
*/
-static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+static bool should_try_claim_block(unsigned int order, int start_mt)
{
- int free_pages, movable_pages, alike_pages;
- unsigned long start_pfn;
- int block_type;
+ /*
+ * Leaving this order check here is intentional, even though the next
+ * check is more relaxed. The reason is that we can actually claim the
+ * whole pageblock if this condition is met, but the check below does not
+ * guarantee it and is just a heuristic, so it could change at any time.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ /*
+ * Above a certain threshold, always try to claim, as it's likely there
+ * will be more free pages in the pageblock.
+ */
+ if (order >= pageblock_order / 2)
+ return true;
+
+ /*
+ * Unmovable/reclaimable allocations would cause permanent
+ * fragmentation if they fell back to allocating from a movable block
+ * (polluting it), so we try to claim the whole block regardless of the
+ * allocation size. Later movable allocations can always steal from this
+ * block, which is less problematic.
+ */
+ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
+ return true;
- block_type = get_pageblock_migratetype(page);
+ if (page_group_by_mobility_disabled)
+ return true;
/*
- * This can happen due to races and we want to prevent broken
- * highatomic accounting.
+ * Movable pages won't cause permanent fragmentation, so for small movable
+ * allocations we only need to temporarily steal the unmovable or
+ * reclaimable pages that are closest to the requested size. After a
+ * while, memory compaction may form large contiguous pages again, and
+ * the next movable allocation may not need to steal.
*/
- if (is_migrate_highatomic(block_type))
- goto single_page;
+ return false;
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * Sets *claim_block to instruct the caller whether it should convert a whole
+ * pageblock to the returned migratetype.
+ * If only_claim is true, this function returns fallback_mt only if
+ * we would do this whole-block claiming. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_claim, bool *claim_block)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *claim_block = false;
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (free_area_empty(area, fallback_mt))
+ continue;
+
+ if (should_try_claim_block(order, migratetype))
+ *claim_block = true;
+
+ if (*claim_block || !only_claim)
+ return fallback_mt;
+ }
+
+ return -1;
+}
+
+/*
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
+ */
+static struct page *
+try_to_claim_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ int block_type, unsigned int alloc_flags)
+{
+ int free_pages, movable_pages, alike_pages;
+ unsigned long start_pfn;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1947,14 +2142,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1987,198 +2178,24 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- page_del_and_expand(zone, page, order, current_order, block_type);
- return page;
-}
-
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
-{
- int i;
- int fallback_mt;
-
- if (area->nr_free == 0)
- return -1;
-
- *can_steal = false;
- for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
-
- if (can_steal_fallback(order, migratetype))
- *can_steal = true;
-
- if (!only_stealable)
- return fallback_mt;
-
- if (*can_steal)
- return fallback_mt;
- }
-
- return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
- struct zone *zone)
-{
- int mt;
- unsigned long max_managed, flags;
-
- /*
- * The number reserved as: minimum is 1 pageblock, maximum is
- * roughly 1% of a zone. But if 1% of a zone falls below a
- * pageblock size, then don't reserve any pageblocks.
- * Check is race-prone but harmless.
- */
- if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
- return;
- max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
- if (zone->nr_reserved_highatomic >= max_managed)
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- /* Recheck the nr_reserved_highatomic limit under the lock */
- if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
-
- /* Yoink! */
- mt = get_pageblock_migratetype(page);
- /* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (!migratetype_is_mergeable(mt))
- goto out_unlock;
-
- if (order < pageblock_order) {
- if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- } else {
- change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
- zone->nr_reserved_highatomic += 1 << order;
- }
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
}
/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
+ * Try finding a free buddy page on the fallback list.
*
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
- bool force)
-{
- struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- int order;
- int ret;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
- ac->nodemask) {
- /*
- * Preserve at least one pageblock unless memory pressure
- * is really high.
- */
- if (!force && zone->nr_reserved_highatomic <=
- pageblock_nr_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
- int mt;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
- if (!page)
- continue;
-
- mt = get_pageblock_migratetype(page);
- /*
- * In page freeing path, migratetype change is racy so
- * we can counter several free pages in a pageblock
- * in this loop although we changed the pageblock type
- * from highatomic to ac->migratetype. So we should
- * adjust the count once.
- */
- if (is_migrate_highatomic(mt)) {
- unsigned long size;
- /*
- * It should never happen but changes to
- * locking could inadvertently allow a per-cpu
- * drain to add pages to MIGRATE_HIGHATOMIC
- * while unreserving so be safe and watch for
- * underflows.
- */
- size = max(pageblock_nr_pages, 1UL << order);
- size = min(size, zone->nr_reserved_highatomic);
- zone->nr_reserved_highatomic -= size;
- }
-
- /*
- * Convert to ac->migratetype and avoid the normal
- * pageblock stealing heuristics. Minimally, the caller
- * is doing the work and needs the pages. More
- * importantly, if the block was always converted to
- * MIGRATE_UNMOVABLE or another type then the number
- * of pageblocks that cannot be completely freed
- * may increase.
- */
- if (order < pageblock_order)
- ret = move_freepages_block(zone, page, mt,
- ac->migratetype);
- else {
- move_to_free_list(page, zone, order, mt,
- ac->migratetype);
- change_pageblock_range(page, order,
- ac->migratetype);
- ret = 1;
- }
- /*
- * Reserving the block(s) already succeeded,
- * so this should not fail on zone boundaries.
- */
- WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return false;
-}
-
-/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * This will attempt to claim a whole pageblock for the requested type
+ * to ensure grouping of such requests in the future.
+ *
+ * If a whole block cannot be claimed, steal an individual page, regressing to
+ * __rmqueue_smallest() logic to at least break up as little contiguity as
+ * possible.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
+ *
+ * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
@@ -2189,7 +2206,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool can_steal;
+ bool claim_block;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2208,49 +2225,40 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, false, &claim_block);
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ if (!claim_block)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_claim_block(zone, page, current_order, order,
+ start_migratetype, fallback_mt,
+ alloc_flags);
+ if (page)
+ goto got_one;
}
- return NULL;
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ return NULL;
-find_smallest:
+ /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
+ start_migratetype, false, &claim_block);
+ if (fallback_mt == -1)
+ continue;
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ goto got_one;
+ }
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ return NULL;
+got_one:
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2306,7 +2314,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@@ -2592,9 +2604,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
- struct page *page, int migratetype,
- unsigned int order)
+static void free_frozen_page_commit(struct zone *zone,
+ struct per_cpu_pages *pcp, struct page *page, int migratetype,
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2629,6 +2641,14 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2643,7 +2663,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
/*
* Free a pcp page
*/
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2652,7 +2673,7 @@ void free_unref_page(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2666,27 +2687,37 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
+ zone = page_zone(page);
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- zone = page_zone(page);
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2743,7 +2774,7 @@ void free_unref_folios(struct folio_batch *folios)
/*
* Free isolated pages directly to the
- * allocator, see comment in free_unref_page.
+ * allocator, see comment in free_frozen_pages.
*/
if (is_migrate_isolate(migratetype)) {
free_one_page(zone, &folio->page, pfn,
@@ -2774,8 +2805,8 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_unref_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
+ order, FPI_NONE);
}
if (pcp) {
@@ -2804,7 +2835,7 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2906,7 +2937,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -3094,6 +3129,142 @@ out:
return page;
}
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * The number reserved as: minimum is 1 pageblock, maximum is
+ * roughly 1% of a zone. But if 1% of a zone falls below a
+ * pageblock size, then don't reserve any pageblocks.
+ * Check is race-prone but harmless.
+ */
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (!migratetype_is_mergeable(mt))
+ goto out_unlock;
+
+ if (order < pageblock_order) {
+ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+ goto out_unlock;
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ } else {
+ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+ zone->nr_reserved_highatomic += 1 << order;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ int ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+ unsigned long size;
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ size = max(pageblock_nr_pages, 1UL << order);
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+ size = zone->nr_reserved_highatomic;
+ zone->nr_reserved_highatomic -= size;
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ if (order < pageblock_order)
+ ret = move_freepages_block(zone, page,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ else {
+ move_to_free_list(page, zone, order,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ change_pageblock_range(page, order,
+ ac->migratetype);
+ ret = 1;
+ }
+ /*
+ * Reserving the block(s) already succeeded,
+ * so this should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
+}
+
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
@@ -3297,6 +3468,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+ if (defrag_mode) {
+ alloc_flags |= ALLOC_NOFRAGMENT;
+ return alloc_flags;
+ }
+
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
@@ -3388,7 +3564,7 @@ retry:
continue;
}
- if (no_fallback && nr_online_nodes > 1 &&
+ if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
@@ -3499,7 +3675,7 @@ try_this_zone:
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
- if (no_fallback) {
+ if (no_fallback && !defrag_mode) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -3567,7 +3743,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
-
return page;
}
@@ -3979,15 +4154,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ unsigned int reclaim_order;
+
+ if (defrag_mode)
+ reclaim_order = max(order, pageblock_order);
+ else
+ reclaim_order = order;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
continue;
- if (last_pgdat != zone->zone_pgdat) {
- wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
- }
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
}
}
@@ -4037,6 +4218,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+ if (defrag_mode)
+ alloc_flags |= ALLOC_NOFRAGMENT;
+
return alloc_flags;
}
@@ -4243,6 +4427,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4418,6 +4603,11 @@ retry:
&compaction_retries))
goto retry;
+ /* Reclaim/compaction failed to prevent the fallback */
+ if (defrag_mode) {
+ alloc_flags &= ALLOC_NOFRAGMENT;
+ goto retry;
+ }
/*
* Deal with possible cpuset update races or zonelist updates to avoid
@@ -4511,7 +4701,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4531,28 +4726,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
+ * allocate nr_pages quickly. Pages are added to the page_array.
*
- * For lists, nr_pages is the number of pages that should be allocated.
- *
- * For arrays, only NULL elements are populated with pages and nr_pages
+ * Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
*
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in the array.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
@@ -4570,7 +4760,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4578,7 +4768,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4660,7 +4850,7 @@ retry_this_zone:
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4678,11 +4868,8 @@ retry_this_zone:
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ set_page_refcounted(page);
+ page_array[nr_populated++] = page;
}
pcp_spin_unlock(pcp);
@@ -4699,14 +4886,8 @@ failed_irq:
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
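
With the list variant removed, callers supply only a page array; NULL slots are filled, populated slots are skipped, and the return value is the number of populated entries. A short caller sketch follows (kernel-module context assumed; the alloc_pages_bulk() convenience wrapper and its (gfp, nr_pages, page_array) argument order are assumptions, not part of this hunk):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Fill every NULL slot in @pages with an order-0 page, best effort. */
static int fill_page_array(struct page **pages, unsigned long nr)
{
	unsigned long filled;

	/* Returns how many entries of @pages are now populated. */
	filled = alloc_pages_bulk(GFP_KERNEL, nr, pages);
	if (filled < nr)
		return -ENOMEM;	/* partial fill: allocated pages remain in @pages */
	return 0;
}
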
@@ -4714,8 +4895,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
- int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4768,7 +4949,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
- __free_pages(page, order);
+ free_frozen_pages(page, order);
page = NULL;
}
@@ -4777,6 +4958,18 @@ out:
return page;
}
+EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
+{
+ struct page *page;
+
+ page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
EXPORT_SYMBOL(__alloc_pages_noprof);
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
@@ -4811,9 +5004,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -4830,22 +5024,36 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
- struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_unref_page(page, order);
+ __free_frozen_pages(page, order, fpi_flags);
else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ pgalloc_tag_sub_pages(page, (1 << order) - 1);
while (order-- > 0)
- free_unref_page(page + (1 << order), order);
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_NONE);
+}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -4866,7 +5074,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -5161,13 +5369,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
zonerefs->zone_idx = 0;
}
-/*
- * Build zonelists ordered by zone and nodes within zones.
- * This results in conserving DMA zone[s] until all Normal memory is
- * exhausted, but results in overflowing to remote node while memory
- * may still exist in local DMA zone.
- */
-
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
@@ -5836,6 +6037,7 @@ static void calculate_totalreserve_pages(void)
}
}
totalreserve_pages = reserve_pages;
+ trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
@@ -5858,14 +6060,15 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
+ trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+ zone->lowmem_reserve[j]);
}
}
}
@@ -5929,6 +6132,7 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+ trace_mm_setup_per_zone_wmarks(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6175,7 +6379,7 @@ out:
return ret;
}
-static struct ctl_table page_alloc_sysctl_table[] = {
+static const struct ctl_table page_alloc_sysctl_table[] = {
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -6202,6 +6406,15 @@ static struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_THREE_THOUSAND,
},
{
+ .procname = "defrag_mode",
+ .data = &defrag_mode,
+ .maxlen = sizeof(defrag_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
@@ -6270,9 +6483,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
* @migratetype: using migratetype to filter the type of migration in
* trace_mm_alloc_contig_migrate_range_info.
*/
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end, int migratetype)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6281,7 +6493,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
int ret = 0;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
struct page *page;
@@ -6351,7 +6563,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
return (ret < 0) ? ret : 0;
}
-static void split_free_pages(struct list_head *list)
+static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
{
int order;
@@ -6362,7 +6574,8 @@ static void split_free_pages(struct list_head *list)
list_for_each_entry_safe(page, next, &list[order], lru) {
int i;
- post_alloc_hook(page, order, __GFP_MOVABLE);
+ post_alloc_hook(page, order, gfp_mask);
+ set_page_refcounted(page);
if (!order)
continue;
@@ -6376,6 +6589,40 @@ static void split_free_pages(struct list_head *list)
}
}
+static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
+{
+ const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
+ const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+
+ /*
+ * We are given the range to allocate; node, mobility and placement
+ * hints are irrelevant at this point. We'll simply ignore them.
+ */
+ gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
+ __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
+
+ /*
+ * We only support most reclaim flags (but not NOFAIL/NORETRY), and
+ * selected action flags.
+ */
+ if (gfp_mask & ~(reclaim_mask | action_mask))
+ return -EINVAL;
+
+ /*
+ * Flags to control page compaction/migration/reclaim, to free up our
+ * page range. Migratable pages are movable, __GFP_MOVABLE is implied
+ * for them.
+ *
+ * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
+ * to not degrade callers.
+ */
+ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
+ __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ return 0;
+}
+
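
Given the rules enforced above, a typical caller passes plain GFP_KERNEL plus a few action modifiers and lets placement hints be stripped. A hedged caller sketch (assumes CONFIG_CONTIG_ALLOC and the existing alloc_contig_pages()/free_contig_range() helpers):

#include <linux/gfp.h>
#include <linux/mm.h>

/* Grab @nr_pages physically contiguous pages near @nid; tolerate failure. */
static struct page *grab_contig(unsigned long nr_pages, int nid)
{
	/* Only reclaim and selected action flags survive verification. */
	return alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
				  nid, NULL);
}

static void drop_contig(struct page *page, unsigned long nr_pages)
{
	if (page)
		free_contig_range(page_to_pfn(page), nr_pages);
}
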
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -6384,7 +6631,9 @@ static void split_free_pages(struct list_head *list)
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
- * @gfp_mask: GFP mask to use during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
*
* The PFN range does not have to be pageblock aligned. The PFN range must
* belong to a single zone.
@@ -6410,11 +6659,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
- .gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ gfp_mask = current_gfp_context(gfp_mask);
+ if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
+ return -EINVAL;
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -6436,7 +6688,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ ret = start_isolate_page_range(start, end, migratetype, 0);
if (ret)
goto done;
@@ -6455,7 +6707,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
if (ret && ret != -EBUSY)
goto done;
- ret = 0;
+
+ /*
+ * When in-use hugetlb pages are migrated, they may simply be released
+ * back into the free hugepage pool instead of being returned to the
+ * buddy system. After the migration of in-use huge pages is completed,
+ * we will invoke replace_free_hugepage_folios() to ensure that these
+ * hugepages are properly released to the buddy system.
+ */
+ ret = replace_free_hugepage_folios(start, end);
+ if (ret)
+ goto done;
/*
* Pages from [start, end) are within a pageblock_nr_pages
@@ -6489,7 +6751,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
}
if (!(gfp_mask & __GFP_COMP)) {
- split_free_pages(cc.freepages);
+ split_free_pages(cc.freepages, gfp_mask);
/* Free head and tail (if any) */
if (start != outer_start)
@@ -6502,6 +6764,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
+ set_page_refcounted(head);
} else {
ret = -EINVAL;
WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
@@ -6556,7 +6819,9 @@ static bool zone_spans_last_pfn(const struct zone *zone,
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
- * @gfp_mask: GFP mask to limit search and used during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
@@ -6961,7 +7226,7 @@ static inline bool has_unaccepted_memory(void)
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
if (!has_unaccepted_memory())
@@ -6970,8 +7235,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
@@ -7028,3 +7303,94 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * try_alloc_pages - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure.
+ */
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+ /*
+ * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * to warn. Also, a warning would trigger printk(), which is unsafe from
+ * various contexts. We cannot use printk_deferred_enter() to mitigate,
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that try_alloc_pages()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+ * confuse PI logic, so return immediately if called from hard IRQ or
+ * NMI.
+ *
+ * Note, irqs_disabled() case is ok. This function can be called
+ * from raw_spin_lock_irqsave region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (has_unaccepted_memory())
+ return NULL;
+#endif
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
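
For reference, a minimal caller sketch for the new opportunistic pair added by this series, try_alloc_pages() and free_pages_nolock() (kernel-module context assumed; error handling kept to the bare minimum):

#include <linux/gfp.h>
#include <linux/mm.h>

/* Best-effort scratch page from any context; may return NULL, never spins. */
static void *grab_scratch_page(void)
{
	struct page *page = try_alloc_pages(NUMA_NO_NODE, 0);

	return page ? page_address(page) : NULL;
}

/* Safe to call while holding raw spinlocks, from IRQ or NMI. */
static void drop_scratch_page(void *addr)
{
	if (addr)
		free_pages_nolock(virt_to_page(addr), 0);
}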