summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c205
1 files changed, 91 insertions, 114 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..111b54df8a3c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -31,7 +31,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
-#include <linux/pagevec.h>
+#include <linux/folio_batch.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
@@ -94,23 +94,6 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-/*
- * On SMP, spin_trylock is sufficient protection.
- * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
- * Pass flags to a no-op inline function to typecheck and silence the unused
- * variable warning.
- */
-static inline void __pcp_trylock_noop(unsigned long *flags) { }
-#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
-#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
-#else
-
-/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
-#define pcp_trylock_prepare(flags) local_irq_save(flags)
-#define pcp_trylock_finish(flags) local_irq_restore(flags)
-#endif
-
/*
* Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
* a migration causing the wrong PCP to be locked and remote memory being
@@ -128,71 +111,52 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { }
#endif
/*
- * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
- * Return value should be used with equivalent unlock helper.
+ * A helper to look up and trylock a pcp with an embedded spinlock.
+ * The return value should be used with the unlock helper.
+ * NULL return value means the trylock failed.
*/
-#define pcpu_spin_trylock(type, member, ptr) \
+#ifdef CONFIG_SMP
+#define pcp_spin_trylock(ptr) \
({ \
- type *_ret; \
+ struct per_cpu_pages *_ret; \
pcpu_task_pin(); \
_ret = this_cpu_ptr(ptr); \
- if (!spin_trylock(&_ret->member)) { \
+ if (!spin_trylock(&_ret->lock)) { \
pcpu_task_unpin(); \
_ret = NULL; \
} \
_ret; \
})
-#define pcpu_spin_unlock(member, ptr) \
+#define pcp_spin_unlock(ptr) \
({ \
- spin_unlock(&ptr->member); \
+ spin_unlock(&ptr->lock); \
pcpu_task_unpin(); \
})
-/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_trylock(ptr, UP_flags) \
-({ \
- struct per_cpu_pages *__ret; \
- pcp_trylock_prepare(UP_flags); \
- __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
- if (!__ret) \
- pcp_trylock_finish(UP_flags); \
- __ret; \
-})
-
-#define pcp_spin_unlock(ptr, UP_flags) \
-({ \
- pcpu_spin_unlock(lock, ptr); \
- pcp_trylock_finish(UP_flags); \
-})
-
/*
- * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
- * a potentially remote cpu drain) and get interrupted by an operation that
- * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
- * spinlock assumptions making the trylock a no-op. So we have to turn that
- * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
- * remote cpu's so we can only be locking the only existing local one.
+ * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus
+ * is not compatible with our locking scheme. However, we do not need pcp for
+ * scalability in the first place, so just make all the trylocks fail and take
+ * the slow path unconditionally.
*/
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-static inline void __flags_noop(unsigned long *flags) { }
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
-({ \
- __flags_noop(&(flags)); \
- spin_lock(&(ptr)->lock); \
-})
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
-({ \
- spin_unlock(&(ptr)->lock); \
- __flags_noop(&(flags)); \
-})
#else
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
- spin_lock_irqsave(&(ptr)->lock, flags)
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
- spin_unlock_irqrestore(&(ptr)->lock, flags)
+#define pcp_spin_trylock(ptr) \
+ NULL
+
+#define pcp_spin_unlock(ptr) \
+ BUG_ON(1)
#endif
+/*
+ * In some cases we do not need to pin the task to the CPU because we are
+ * already given a specific cpu's pcp pointer.
+ */
+#define pcp_spin_lock_nopin(ptr) \
+ spin_lock(&(ptr)->lock)
+#define pcp_spin_unlock_nopin(ptr) \
+ spin_unlock(&(ptr)->lock)
+
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -243,6 +207,8 @@ unsigned int pageblock_order __read_mostly;
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags);
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -687,7 +653,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
- VM_BUG_ON(order != HPAGE_PMD_ORDER);
+ VM_BUG_ON(!is_pmd_order(order));
movable = migratetype == MIGRATE_MOVABLE;
@@ -719,7 +685,7 @@ static inline bool pcp_allowed_order(unsigned int order)
if (order <= PAGE_ALLOC_COSTLY_ORDER)
return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order == HPAGE_PMD_ORDER)
+ if (is_pmd_order(order))
return true;
#endif
return false;
@@ -731,7 +697,7 @@ static inline bool pcp_allowed_order(unsigned int order)
* The first PAGE_SIZE page is called the "head page" and have PG_head set.
*
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
- * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
+ * in bit 0 of page->compound_info. The rest of the bits are a pointer to the head page.
*
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
@@ -744,7 +710,7 @@ void prep_compound_page(struct page *page, unsigned int order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++)
- prep_compound_tail(page, i);
+ prep_compound_tail(page + i, page, order);
prep_compound_head(page, order);
}
@@ -1079,7 +1045,6 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
- page_pool_page_is_pp(page) |
(page->flags.f & check_flags)))
return false;
@@ -1106,8 +1071,6 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(page_pool_page_is_pp(page)))
- bad_reason = "page_pool leak";
return bad_reason;
}
@@ -1416,9 +1379,17 @@ __always_inline bool __free_pages_prepare(struct page *page,
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
folio->mapping = NULL;
}
- if (unlikely(page_has_type(page)))
+ if (unlikely(page_has_type(page))) {
+ /* networking expects to clear its page type before releasing */
+ if (is_check_pages_enabled()) {
+ if (unlikely(PageNetpp(page))) {
+ bad_page(page, "page_pool leak");
+ return false;
+ }
+ }
/* Reset the page_type (which overlays _mapcount) */
page->page_type = UINT_MAX;
+ }
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
@@ -2588,7 +2559,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
int high_min, to_drain, to_drain_batched, batch;
- unsigned long UP_flags;
bool todo = false;
high_min = READ_ONCE(pcp->high_min);
@@ -2608,9 +2578,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->count - pcp->high;
while (to_drain > 0) {
to_drain_batched = min(to_drain, batch);
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
todo = true;
to_drain -= to_drain_batched;
@@ -2627,15 +2597,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long UP_flags;
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
}
#endif
@@ -2646,11 +2615,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- unsigned long UP_flags;
int count;
do {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
count = pcp->count;
if (count) {
int to_drain = min(count,
@@ -2659,7 +2627,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
} while (count);
}
@@ -2858,7 +2826,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
*/
static bool free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int to_free, to_free_batched;
@@ -2918,9 +2886,9 @@ static bool free_frozen_page_commit(struct zone *zone,
if (to_free == 0 || pcp->count == 0)
break;
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
ret = false;
break;
@@ -2932,7 +2900,7 @@ static bool free_frozen_page_commit(struct zone *zone,
* returned in an unlocked state.
*/
if (smp_processor_id() != cpu) {
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
ret = false;
break;
}
@@ -2964,7 +2932,6 @@ static bool free_frozen_page_commit(struct zone *zone,
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
@@ -3000,12 +2967,12 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
add_page_to_zone_llist(zone, page, order);
return;
}
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
if (!free_frozen_page_commit(zone, pcp, page, migratetype,
- order, fpi_flags, &UP_flags))
+ order, fpi_flags))
return;
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, fpi_flags);
}
@@ -3026,7 +2993,6 @@ void free_frozen_pages_nolock(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -3069,7 +3035,7 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
locked_zone = NULL;
pcp = NULL;
}
@@ -3088,7 +3054,7 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
@@ -3106,14 +3072,14 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
if (!free_frozen_page_commit(zone, pcp, &folio->page,
- migratetype, order, FPI_NONE, &UP_flags)) {
+ migratetype, order, FPI_NONE)) {
pcp = NULL;
locked_zone = NULL;
}
}
if (pcp)
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
folio_batch_reinit(folios);
}
@@ -3275,6 +3241,13 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order));
+ /*
+ * If this is a high-order atomic allocation then check
+ * if the pageblock should be reserved for the future
+ */
+ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
+ reserve_highatomic_pageblock(page, order, zone);
+
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -3346,6 +3319,20 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;
+ /*
+ * Don't refill the list for a higher order atomic
+ * allocation under memory pressure, as this would
+ * not build up any HIGHATOMIC reserves, which
+ * might be needed soon.
+ *
+ * Instead, direct it towards the reserves by
+ * returning NULL, which will make the caller fall
+ * back to rmqueue_buddy. This will try to use the
+ * reserves first and grow them if needed.
+ */
+ if (alloc_flags & ALLOC_HIGHATOMIC)
+ return NULL;
+
alloced = rmqueue_bulk(zone, order,
batch, list,
migratetype, alloc_flags);
@@ -3371,10 +3358,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
return NULL;
@@ -3386,7 +3372,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -3961,13 +3947,6 @@ try_this_zone:
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
- /*
- * If this is a high-order atomic allocation then check
- * if the pageblock should be reserved for the future
- */
- if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
- reserve_highatomic_pageblock(page, order, zone);
-
return page;
} else {
if (cond_accept_memory(zone, order, alloc_flags))
@@ -5067,7 +5046,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5136,7 +5114,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
- mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages - nr_populated;
if (zone_watermark_fast(zone, 0, mark,
zonelist_zone_idx(ac.preferred_zoneref),
alloc_flags, gfp)) {
@@ -5161,7 +5139,7 @@ retry_this_zone:
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
goto failed;
@@ -5180,7 +5158,7 @@ retry_this_zone:
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
goto failed;
}
break;
@@ -5192,7 +5170,7 @@ retry_this_zone:
page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -6147,7 +6125,6 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- unsigned long UP_flags;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
cci = get_cpu_cacheinfo(cpu);
@@ -6158,12 +6135,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
* This can reduce zone lock contention without hurting
* cache-hot pages sharing.
*/
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
pcp->flags |= PCPF_FREE_HIGH_BATCH;
else
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
void setup_pcp_cacheinfo(unsigned int cpu)
@@ -6553,8 +6530,8 @@ void calculate_min_free_kbytes(void)
if (new_min_free_kbytes > user_min_free_kbytes)
min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
else
- pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
- new_min_free_kbytes, user_min_free_kbytes);
+ pr_warn_ratelimited("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+ new_min_free_kbytes, user_min_free_kbytes);
}